In [None]:
import numpy as np
import numba
import umap
import pynndescent

# Report the installed version of each library imported above, one per line.
for label, module in (
    ("NumPy", np),
    ("Numba", numba),
    ("UMAP", umap),
    ("PyNNDescent", pynndescent),
):
    print(f"{label} version:", module.__version__)


In [1]:
import os
import json

import ale_py
import torch as T
import torch.nn as nn
from torch import optim
from torch import distributions
import numpy as np
# import pandas as pd
# from umap import UMAP
import gymnasium as gym
import gymnasium_robotics as gym_robo
import wandb
# from mpi4py import MPI

# Project modules. `models`, `rl_agents` and `helper` are imported as modules as
# well as by name because later cells reference `models.X`, `rl_agents.X` and
# `helper.X` directly.
import torch_utils
import models
from models import ValueModel, StochasticContinuousPolicy, ActorModel, StochasticDiscretePolicy
import cnn_models
import rl_agents
from rl_agents import PPO, DDPG, TD3, Reinforce, ActorCritic, HER
import rl_callbacks
from rl_callbacks import WandbCallback
import helper
from helper import Normalizer
import gym_helper
import wandb_support

error: XDG_RUNTIME_DIR is invalid or not set in the environment.
[5ffba6a3ce68:01336] shmem: mmap: an error occurred while determining whether or not /tmp/ompi.5ffba6a3ce68.0/jf.0/580714496/shared_mem_cuda_pool.5ffba6a3ce68 could be created.
[5ffba6a3ce68:01336] create_and_attach: unable to create shared memory BTL coordinating structure :: size 134217728 


In [None]:
import mujoco

In [None]:
mujoco.MjModel

In [None]:
gym_robo.__version__

In [None]:
def check_cuda():
    """Print whether CUDA is available and, if so, describe every visible GPU.

    For each device the name and total memory (bytes divided by 1024**3,
    labelled "GB") are printed. Returns None.
    """
    if not T.cuda.is_available():
        print("CUDA is not available.")
        return

    print("CUDA is available.")
    gpu_count = T.cuda.device_count()
    print(f"Number of GPUs detected: {gpu_count}")

    bytes_per_gb = 1024 ** 3
    for index in range(gpu_count):
        name = T.cuda.get_device_name(index)
        total_gb = T.cuda.get_device_properties(index).total_memory / bytes_per_gb
        print(f"GPU {index}: {name}")
        print(f"Total memory: {total_gb:.2f} GB")

check_cuda()

In [None]:
def get_default_device():
    """Returns the default device for computations, GPU if available, otherwise CPU.

    Returns:
        torch.device: ``cuda`` when ``T.cuda.is_available()`` is true, else ``cpu``.
    """
    # PyTorch is imported in this notebook as `T`; the bare name `torch` is not bound.
    return T.device('cuda' if T.cuda.is_available() else 'cpu')

device = get_default_device()
print(f"Using device: {device}")

# TEST

In [None]:
gym_robo.register_robotics_envs()

In [None]:
gym.envs.registration.registry

In [None]:
# Authenticate with Weights & Biases without embedding a secret in the notebook:
# the key is read from the WANDB_API_KEY environment variable. When it is unset,
# `key=None` lets wandb fall back to its own credential lookup / prompt.
# SECURITY: the API key that used to be hard-coded here has been exposed in the
# notebook and should be revoked and regenerated.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
# Run random-action episodes on FetchPush-v2 and record a video of every episode
# into the 'test/' folder.
env = gym.make('FetchPush-v2', max_episode_steps=100, render_mode='rgb_array')
# The trigger returns True for every episode index, so every episode is recorded.
env = gym.wrappers.RecordVideo(env, 'test/', episode_trigger=lambda episode_idx: True)

episodes = 10

try:
    for episode in range(episodes):
        obs, _ = env.reset()
        done = False
        while not done:
            # Bind the step info to `info` rather than `dict`, which would shadow
            # the builtin for every later cell in this kernel session.
            obs, r, term, trunc, info = env.step(env.action_space.sample())
            done = bool(term or trunc)
finally:
    # Always close the environment, even if a step raises.
    env.close()

In [None]:
# Take one random step in FetchReach-v2 and verify that the step outputs agree
# with the environment's compute_* methods.
env = gym.make("FetchReach-v2")
env.reset()
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())

# Arguments shared by all three compute_* calls.
goal_args = (obs["achieved_goal"], obs["desired_goal"], info)

# The following always has to hold:
assert reward == env.compute_reward(*goal_args)
assert truncated == env.compute_truncated(*goal_args)
assert terminated == env.compute_terminated(*goal_args)

In [None]:
# compute_reward takes the achieved goal, the desired goal and the info dict
# (see the assertions in the previous cell); calling it with no arguments raises
# TypeError. Reuse `obs` and `info` from the step taken in the previous cell.
env.compute_reward(obs["achieved_goal"], obs["desired_goal"], info)

In [None]:
env = gym.make('FetchPush-v2', render_mode='rgb_array')

In [None]:
# Does the (wrapped) env object expose `distance_threshold` via plain attribute access?
print('true' if hasattr(env, "distance_threshold") else 'false')

In [None]:
# Look the attribute up through the wrapper stack; prints only when the value is truthy.
threshold = env.get_wrapper_attr("distance_threshold")
if threshold:
    print('true')

In [None]:
print(dir(env))


# DDPG

In [None]:
env = gym.make('Pendulum-v1')

In [None]:
# build actor

# Two hidden-layer specs (400 and 300) with relu and a "variance scaling"
# initializer config; each tuple gets its own freshly built config dict.
dense_layers = [
    (
        size,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for size in (400, 300)
]

# NOTE(review): this call passes `normalize_layers=`, while other ActorModel calls in
# this notebook pass `normalize=` — confirm which keyword models.ActorModel accepts.
actor = models.ActorModel(
    env,
    cnn_model=None,
    dense_layers=dense_layers,
    optimizer='Adam',
    optimizer_params={'weight_decay':0.01},
    learning_rate=0.001,
    normalize_layers=True,
)

In [None]:
actor

In [None]:
# NOTE(review): `ddpg_agent` is first assigned in a later cell of this section, so on a
# fresh kernel this cell raises NameError until that cell has been run.
ddpg_agent.actor_model

In [None]:
# NOTE(review): `ddpg_agent` is first assigned in a later cell of this section, so on a
# fresh kernel this cell raises NameError until that cell has been run.
ddpg_agent.target_actor_model

In [None]:
# build critic

# One layer spec for the state branch (400) and one for the merged branch (300),
# both relu with a "variance scaling" initializer config.
state_layers = [
    (400, "relu", {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}}),
]

merged_layers = [
    (300, "relu", {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}}),
]


# NOTE(review): this call passes `normalize_layers=`, while other CriticModel calls in
# this notebook pass `normalize=` — confirm which keyword models.CriticModel accepts.
critic = models.CriticModel(
    env=env,
    cnn_model=None,
    state_layers=state_layers,
    merged_layers=merged_layers,
    optimizer='Adam',
    optimizer_params={'weight_decay':0.01},
    learning_rate=0.002,
    normalize_layers=True,
)

In [None]:
critic

In [None]:
# Replay buffer (second argument 100000 is presumably the capacity — confirm
# against helper.ReplayBuffer) and Ornstein-Uhlenbeck exploration noise.
replay_buffer = helper.ReplayBuffer(env, 100000)
# Pick the noise device at runtime instead of hard-coding 'cuda', so the cell
# also runs on machines without a GPU.
noise = helper.OUNoise(
    shape=env.action_space.shape,
    dt=1.0,
    device='cuda' if T.cuda.is_available() else 'cpu',
)

In [None]:
# Assemble the DDPG agent from the actor, critic, replay buffer and noise built
# in the cells above; runs are reported through a WandbCallback for 'Pendulum-v1'.
ddpg_agent = rl_agents.DDPG(
    env=env,
    actor_model=actor,
    critic_model=critic,
    discount=0.99,
    tau=0.005,
    replay_buffer=replay_buffer,
    noise=noise,
    callbacks=[rl_callbacks.WandbCallback('Pendulum-v1')],
)

In [None]:
ddpg_agent.critic_model

In [None]:
ddpg_agent.target_critic_model

In [None]:
ddpg_agent.train(100, True, 10)

In [None]:
ddpg_agent.test(10, True, 1)

# Actor Critic

In [None]:
env = gym.make("CartPole-v1")

In [None]:
# Two relu layer specs (128 then 256), each using the "kaiming normal" initializer name.
dense_layers = [(size, 'relu', "kaiming normal") for size in (128, 256)]



In [None]:
policy_model = models.PolicyModel(env=env, dense_layers=dense_layers, optimizer='Adam', learning_rate=0.001,)

In [None]:
for param in policy_model.parameters():
    print(param)

In [None]:
value_model = models.ValueModel(env, dense_layers=dense_layers, optimizer='Adam', learning_rate=0.001)

In [None]:
value_model

In [None]:
for params in value_model.parameters():
    print(params)

In [None]:
# Actor-critic agent for CartPole-v1 built from the policy and value models above.
actor_critic = rl_agents.ActorCritic(
    env,
    policy_model,
    value_model,
    discount=0.99,
    policy_trace_decay=0.5,
    value_trace_decay=0.5,
    callbacks=[rl_callbacks.WandbCallback('CartPole-v1-Actor-Critic')],
)

In [None]:
actor_critic.train(200)

In [None]:
actor_critic.test(10, True, 1)

# REINFORCE

In [None]:
env = gym.make("CartPole-v1")

In [None]:
# Single 128-wide relu layer spec with an explicitly parameterised
# "kaiming normal" initializer config.
dense_layers = [
    (128, 'relu', {"kaiming normal": {"a": 1.0, "mode": 'fan_in'}}),
    # Disabled second layer kept for reference:
    # (256, 'relu', {"kaiming_normal": {"a": 0.0, "mode": 'fan_in'}}),
]

In [None]:
dense_layers = [(128, 'relu', "kaiming normal")]

In [None]:
value_model = models.ValueModel(env, dense_layers, 'Adam', 0.001)

In [None]:
for param in value_model.parameters():
    print(param)

In [None]:
policy_model = models.PolicyModel(env, dense_layers, 'Adam', 0.001)

In [None]:
for param in policy_model.parameters():
    print(param)

In [None]:
reinforce = rl_agents.Reinforce(env, policy_model, value_model, 0.99, [rl_callbacks.WandbCallback('CartPole-v0_REINFORCE', chkpt_freq=100)])

In [None]:
reinforce.train(200, True, 50)

In [None]:
reinforce.test(10, True, 1)

# DDPG w/CNN

In [None]:
env = gym.make('CarRacing-v2')

In [None]:
# Layer specs passed to cnn_models.CNN: three conv layers (kernel sizes 7, 5, 3;
# 32 output channels, stride 3, 'valid' padding, no bias), with relu + batchnorm
# between them. An input batchnorm over 3 features is intentionally left out.
cnn_layers = [
    {"conv": {"out_channels": 32, "kernel_size": 7, "stride": 3, "padding": 'valid', "bias": False}},
    {"relu": {}},
    {"batchnorm": {"num_features": 32}},
    {"conv": {"out_channels": 32, "kernel_size": 5, "stride": 3, "padding": 'valid', "bias": False}},
    {"relu": {}},
    {"batchnorm": {"num_features": 32}},
    {"conv": {"out_channels": 32, "kernel_size": 3, "stride": 3, "padding": 'valid', "bias": False}},
]

In [None]:
cnn = cnn_models.CNN(cnn_layers, env)

In [None]:
cnn

In [None]:
# build actor

# Three identical 64-wide relu layer specs, each with its own
# "variance scaling" initializer config dict.
dense_layers = [
    (
        64,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for _ in range(3)
]

actor = models.ActorModel(env, cnn_model=cnn, dense_layers=dense_layers, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.0001, normalize=False)

In [None]:
actor

In [None]:
# build critic

# No state-only layers; the merged branch has three identical 64-wide relu
# layer specs, each with its own "variance scaling" initializer config dict.
state_layers = []

merged_layers = [
    (
        64,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for _ in range(3)
]


critic = models.CriticModel(env=env, cnn_model=cnn, state_layers=state_layers, merged_layers=merged_layers, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.0001, normalize=False)

In [None]:
critic

In [None]:
# Replay buffer with a (1,) goal shape and Ornstein-Uhlenbeck exploration noise.
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape=(1,))
# Pick the noise device at runtime instead of hard-coding 'cuda', so the cell
# also runs on machines without a GPU.
noise = helper.OUNoise(
    shape=env.action_space.shape,
    mean=0.0,
    theta=0.15,
    sigma=0.01,
    dt=1.0,
    device='cuda' if T.cuda.is_available() else 'cpu',
)

In [None]:
# DDPG agent for CarRacing-v2 using the CNN-backed actor and critic built above.
ddpg_agent = rl_agents.DDPG(env, actor, critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=128,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback("CarRacing-v2")])

In [None]:
ddpg_agent.train(1000, True, 10)

In [None]:
wandb.finish()

In [None]:
wandb.login()

# HER

In [None]:
env = gym.make("Reacher-v4")

In [None]:
_,_ = env.reset()

In [None]:
# Capture the achieved goal before stepping, take one random action, then print
# what the unwrapped Reacher env reports.
achieved_goal = gym_helper.reacher_achieved_goal(env)
action = env.action_space.sample()
env.step(action)

wrapper_attr = env.get_wrapper_attr
print(f'observation: {wrapper_attr("_get_obs")()}')
# Elements from index 8 onward of the observation are labelled "distance to goal" here.
print(f'distance to goal: {wrapper_attr("_get_obs")()[8::]}')
print(f'fingertip: {wrapper_attr("get_body_com")("fingertip")}')
print(f'target: {wrapper_attr("get_body_com")("target")}')

In [None]:
next_achieved_goal = env.get_wrapper_attr("_get_obs")()[8::]
desired_goal = [0.0, 0.0, 0.0]

In [None]:
# NOTE(review): `reward_func` is first assigned in the next cell (from
# gym_helper.get_her_goal_functions), so on a fresh kernel this cell raises
# NameError until that cell has been run.
reward_func(env, action, achieved_goal, next_achieved_goal, desired_goal, 0.05)

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
desired_goal_func(env).shape

In [None]:
# build actor

# Three identical 256-wide relu layer specs, each with its own
# "variance scaling" initializer config dict.
dense_layers = [
    (
        256,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for _ in range(3)
]

# Goal-conditioned actor (goal_shape=(3,)) without a CNN front end.
actor = models.ActorModel(
    env,
    cnn_model=None,
    dense_layers=dense_layers,
    goal_shape=(3,),
    optimizer="Adam",
    optimizer_params={'weight_decay':0.0},
    learning_rate=0.0001,
    normalize=False,
)

In [None]:
# build critic

# No state-only layers; the merged branch has three identical 256-wide relu
# layer specs, each with its own "variance scaling" initializer config dict.
state_layers = []

merged_layers = [
    (
        256,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for _ in range(3)
]


# Goal-conditioned critic (goal_shape=(3,)) without a CNN front end.
critic = models.CriticModel(
    env=env,
    cnn_model=None,
    state_layers=state_layers,
    merged_layers=merged_layers,
    goal_shape=(3,),
    optimizer="Adam",
    optimizer_params={'weight_decay':0.0},
    learning_rate=0.0001,
    normalize=False,
)

In [None]:
# Size the replay buffer's goal storage from whatever the desired-goal function returns.
goal_shape = desired_goal_func(env).shape
replay_buffer = helper.ReplayBuffer(env, 100000, goal_shape)

# Exploration noise: normal noise with mean 0.0 and stddev 0.05.
# Disabled alternative kept for reference:
# noise = helper.OUNoise(shape=env.action_space.shape, mean=0.0, theta=0.05,
#                        sigma=0.15, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
# DDPG agent for Reacher-v4 that the HER wrapper in the next cell builds on.
ddpg_agent = rl_agents.DDPG(
    env=env,
    actor_model=actor,
    critic_model=critic,
    discount=0.98,
    tau=0.05,
    action_epsilon=0.2,
    replay_buffer=replay_buffer,
    batch_size=256,
    noise=noise,
    callbacks=[rl_callbacks.WandbCallback('Reacher-v4')],
)

In [None]:
# Wrap the DDPG agent with HER using the 'future' strategy and 4 goals,
# plugging in the goal/reward functions obtained from gym_helper.
her = rl_agents.HER(
    ddpg_agent,
    strategy='future',
    num_goals=4,
    tolerance=0.001,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
)

In [None]:
her.train(10, 50, 16, 40, True, 1000)

In [None]:
wandb.finish()

In [None]:
her.test(10, True, 1)

In [None]:
her.save()

In [None]:
her.agent.goal_normalizer.running_std

In [None]:
loaded_her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/assets/models/her")

In [None]:
loaded_her.agent.replay_buffer.sample(10)

In [None]:
loaded_her.agent.state_normalizer.running_cnt

In [None]:
loaded_her.get_config()

In [None]:
loaded_her.test(10, True, 1)

In [None]:
10e4

# HER w/CNN

In [None]:
env = gym.make('CarRacing-v2')

In [None]:
_,_ = env.reset()

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# Shape of the goal returned by the desired-goal function unpacked in the previous
# cell. (`desired_goal` is not that function — elsewhere in this notebook it holds
# goal values — so calling it here was a bug.)
desired_goal_func(env).shape

In [None]:
# Layer specs passed to cnn_models.CNN: three conv layers (kernel sizes 7, 5, 3;
# 32 output channels, stride 3, 'valid' padding, no bias), with relu + batchnorm
# between them. An input batchnorm over 3 features is intentionally left out.
cnn_layers = [
    {"conv": {"out_channels": 32, "kernel_size": 7, "stride": 3, "padding": 'valid', "bias": False}},
    {"relu": {}},
    {"batchnorm": {"num_features": 32}},
    {"conv": {"out_channels": 32, "kernel_size": 5, "stride": 3, "padding": 'valid', "bias": False}},
    {"relu": {}},
    {"batchnorm": {"num_features": 32}},
    {"conv": {"out_channels": 32, "kernel_size": 3, "stride": 3, "padding": 'valid', "bias": False}},
]

cnn = cnn_models.CNN(cnn_layers, env)

In [None]:
# build actor

# Three identical 256-wide relu layer specs, each with its own
# "variance scaling" initializer config dict.
dense_layers = [
    (
        256,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for _ in range(3)
]

# Goal-conditioned actor (goal_shape=(1,)) on top of the CNN built above.
actor = models.ActorModel(
    env,
    cnn_model=cnn,
    dense_layers=dense_layers,
    goal_shape=(1,),
    optimizer="Adam",
    optimizer_params={'weight_decay':0.0},
    learning_rate=0.001,
    normalize=False,
)

In [None]:
actor

In [None]:
# build critic

# No state-only layers; the merged branch has three identical 256-wide relu
# layer specs, each with its own "variance scaling" initializer config dict.
state_layers = []

merged_layers = [
    (
        256,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for _ in range(3)
]


# Goal-conditioned critic (goal_shape=(1,)) on top of the CNN built above.
critic = models.CriticModel(
    env=env,
    cnn_model=cnn,
    state_layers=state_layers,
    merged_layers=merged_layers,
    goal_shape=(1,),
    optimizer="Adam",
    optimizer_params={'weight_decay':0.0},
    learning_rate=0.001,
    normalize=False,
)

In [None]:
critic

In [None]:
# Size the replay buffer's goal storage from whatever the desired-goal function returns.
goal_shape = desired_goal_func(env).shape
replay_buffer = helper.ReplayBuffer(env, 100000, goal_shape)

# Exploration noise: normal noise with mean 0.0 and stddev 0.05.
# Disabled alternative kept for reference:
# noise = helper.OUNoise(shape=env.action_space.shape, mean=0.0, theta=0.05,
#                        sigma=0.15, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
# DDPG agent for CarRacing-v2 that the HER wrapper below builds on.
ddpg_agent = rl_agents.DDPG(
    env=env,
    actor_model=actor,
    critic_model=critic,
    discount=0.98,
    tau=0.05,
    action_epsilon=0.2,
    replay_buffer=replay_buffer,
    batch_size=256,
    noise=noise,
    callbacks=[rl_callbacks.WandbCallback('CarRacing-v2')],
)

In [None]:
ddpg_agent.actor_model

In [None]:
# Wrap the DDPG agent with HER using the 'future' strategy, 4 goals and a
# tolerance of 1, plugging in the goal/reward functions from gym_helper.
her = rl_agents.HER(
    ddpg_agent,
    strategy='future',
    num_goals=4,
    tolerance=1,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
)

In [None]:
her.agent.actor_model

In [None]:
# Train HER: 20 epochs x 50 cycles x 16 episodes, 40 updates per cycle setting,
# rendering enabled with render_freq=20.
her.train(
    num_epochs=20,
    num_cycles=50,
    num_episodes=16,
    num_updates=40,
    render=True,
    render_freq=20,
)

In [None]:
her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/models/her")

In [None]:
wandb.finish()

In [None]:
# Manually roll out one episode with the HER agent, storing every raw
# (non-normalized) transition in the replay buffer and in per-episode lists
# that the following cells use for goal relabelling.
#
# NOTE(review): this cell reads the normalizers from `her` directly, while other
# cells in this notebook read them from `her.agent` (her.agent.state_normalizer /
# her.agent.goal_normalizer) — confirm which object owns them in the current HER class.

# reset environment
state, _ = her.agent.env.reset()
# instantiate empty lists to store current episode trajectory
states, actions, next_states, dones = [], [], [], []
state_achieved_goals, next_state_achieved_goals, desired_goals = [], [], []
# set desired goal
desired_goal = her.desired_goal_func(her.agent.env)
# set achieved goal
state_achieved_goal = her.achieved_goal_func(her.agent.env)
# add initial state and goals to local normalizer stats
her.state_normalizer.update_local_stats(state)
her.goal_normalizer.update_local_stats(desired_goal)
her.goal_normalizer.update_local_stats(state_achieved_goal)
# set done flag
done = False
# reset episode reward to 0
episode_reward = 0
# reset steps counter for the episode
episode_steps = 0

while not done:
    # get normalized values for state and desired goal
    state_norm = her.state_normalizer.normalize(state)
    desired_goal_norm = her.goal_normalizer.normalize(desired_goal)
    # get action
    action = her.agent.get_action(state_norm, desired_goal_norm, grad=False)
    # take action
    next_state, reward, term, trunc, _ = her.agent.env.step(action)
    # Update the done flag BEFORE storing the transition. Previously it was
    # updated at the end of the loop body, so every stored transition —
    # including the final one — was recorded with done=False.
    done = bool(term or trunc)
    # get next state achieved goal
    next_state_achieved_goal = her.achieved_goal_func(her.agent.env)
    # add next state and next state achieved goal to normalizers
    her.state_normalizer.update_local_stats(next_state)
    her.goal_normalizer.update_local_stats(next_state_achieved_goal)
    # store trajectory in replay buffer (non normalized!)
    her.agent.replay_buffer.add(state, action, reward, next_state, done,
                                state_achieved_goal, next_state_achieved_goal, desired_goal)

    # append step state, action, next state, and goals to respective lists
    states.append(state)
    actions.append(action)
    next_states.append(next_state)
    dones.append(done)
    state_achieved_goals.append(state_achieved_goal)
    next_state_achieved_goals.append(next_state_achieved_goal)
    desired_goals.append(desired_goal)

    # add to episode reward and increment steps counter
    episode_reward += reward
    episode_steps += 1
    # update state and state achieved goal
    state = next_state
    state_achieved_goal = next_state_achieved_goal

In [None]:
# package episode states, actions, next states, and goals into trajectory tuple
trajectory = (states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)

In [None]:
states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals = trajectory

In [None]:
# Print every recorded step of the episode (the state arrays themselves are not printed).
recorded_steps = zip(
    states,
    actions,
    next_states,
    dones,
    state_achieved_goals,
    next_state_achieved_goals,
    desired_goals,
)
for idx, (s, a, ns, d, sag, nsag, dg) in enumerate(recorded_steps):
    print(f'a={a}, d={d}, sag={sag}, nsag={nsag}, dg={dg}')

In [None]:
strategy = "future"
num_goals = 4


def _store_relabelled(transition, new_desired_goal):
    """Recompute the reward of `transition` for `new_desired_goal`, print it and add it to the replay buffer."""
    state, action, next_state, done, state_achieved_goal, next_state_achieved_goal = transition
    new_reward = her.reward_fn(state_achieved_goal, next_state_achieved_goal, new_desired_goal)
    print(f'transition: action={action}, reward={new_reward}, done={done}, state_achieved_goal={state_achieved_goal}, next_state_achieved_goal={next_state_achieved_goal}, desired_goal={new_desired_goal}')
    her.agent.replay_buffer.add(state, action, new_reward, next_state, done, state_achieved_goal, next_state_achieved_goal, new_desired_goal)


# Walk the recorded trajectory and, for each step, store copies of the transition
# whose desired goal is replaced by a goal that was actually achieved.
episode_length = len(states)
recorded_steps = zip(states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)

for idx, step in enumerate(recorded_steps):
    # The originally desired goal (last element) is not needed for relabelling.
    transition = step[:-1]

    if strategy == "final":
        # Relabel with the goal achieved at the very end of the episode.
        _store_relabelled(transition, next_state_achieved_goals[-1])
    elif strategy == 'future':
        # Up to `num_goals` relabels, limited by how many later steps exist;
        # each goal is drawn uniformly from the steps after `idx`.
        steps_remaining = episode_length - (idx + 1)
        for _ in range(min(num_goals, steps_remaining)):
            goal_idx = np.random.randint(idx + 1, episode_length)
            _store_relabelled(transition, next_state_achieved_goals[goal_idx])
    

    


In [None]:
s, a, r, ns, d, sag, nsag, dg = her.agent.replay_buffer.sample(100)

In [None]:
for i in range(100):
    print(f'{i}: a={a[i]}, r={r[i]}, d={d[i]}, sag={sag[i]}, nsag={nsag[i]}, dg={dg[i]} ')

# HER Pendulum

In [None]:
env = gym.make('Pendulum-v1')

In [None]:
# build actor

# Two hidden-layer specs (400 and 300) with relu and a "variance scaling"
# initializer config; each tuple gets its own freshly built config dict.
dense_layers = [
    (
        size,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for size in (400, 300)
]

# NOTE(review): this call passes `normalize=`, while some other ActorModel calls in
# this notebook pass `normalize_layers=` — confirm which keyword models.ActorModel accepts.
actor = models.ActorModel(
    env,
    cnn_model=None,
    dense_layers=dense_layers,
    optimizer='Adam',
    optimizer_params={'weight_decay':0.01},
    learning_rate=0.001,
    normalize=False,
)

In [None]:
# build critic

# No state-only layers; the merged branch has three identical 64-wide relu
# layer specs, each with its own (empty) "kaiming uniform" config dict.
state_layers = []

merged_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]


critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.001, normalize=False)

In [None]:
# Replay buffer with a (3,) goal shape and Ornstein-Uhlenbeck exploration noise.
replay_buffer = helper.ReplayBuffer(env, 100000, (3,))
# Pick the noise device at runtime instead of hard-coding 'cuda', so the cell
# also runs on machines without a GPU.
noise = helper.OUNoise(
    shape=env.action_space.shape,
    dt=1.0,
    device='cuda' if T.cuda.is_available() else 'cpu',
)

In [None]:
# DDPG agent for Pendulum-v1 that the HER wrapper below builds on.
ddpg_agent = rl_agents.DDPG(
    env=env,
    actor_model=actor,
    critic_model=critic,
    discount=0.99,
    tau=0.005,
    replay_buffer=replay_buffer,
    noise=noise,
    callbacks=[rl_callbacks.WandbCallback('Pendulum-v1')],
)

In [None]:
def desired_goal_func(env):
    """Return the fixed desired goal: a float array of three zeros.

    `env` is accepted for interface compatibility but is not used.
    """
    return np.zeros(3)

def achieved_goal_func(env):
    """Return the result of calling the env's `_get_obs` attribute, looked up via `get_wrapper_attr`."""
    get_obs = env.get_wrapper_attr('_get_obs')
    return get_obs()

def reward_func(env):
    """Placeholder reward function: does nothing and returns None."""
    return None

In [None]:
# HER wrapper around the Pendulum DDPG agent with strategy 'none', using the
# goal/reward functions defined in the previous cell.
her = rl_agents.HER(agent=ddpg_agent,
                    strategy='none',
                    desired_goal=desired_goal_func,
                    achieved_goal=achieved_goal_func,
                    reward_fn=reward_func,
                    normalizer_clip=10.0)

In [None]:
her.agent.critic_model

In [None]:
her.agent.target_critic_model

In [None]:
her.train(1,1,100,1)

In [None]:
wandb.finish()

In [None]:
state = env.observation_space.sample()
state

In [None]:
her.agent.state_normalizer.normalize(state)

In [None]:
goal = her.desired_goal_func(her.agent.env)
goal

In [None]:
her.agent.goal_normalizer.normalize(goal)

In [None]:
def remove_renders(folder_path):
    """Delete rendered-episode artifacts directly inside ``folder_path``.

    Every regular file whose name ends in ``.mp4`` or ``.meta.json`` is removed.
    Other files, and any sub-directories (even ones whose name carries one of
    those suffixes), are left untouched. The folder is not searched recursively.

    Args:
        folder_path: Directory to clean.

    Returns:
        None.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist (from ``os.listdir``).
    """
    render_suffixes = (".mp4", ".meta.json")
    for filename in os.listdir(folder_path):
        if not filename.endswith(render_suffixes):
            continue
        file_path = os.path.join(folder_path, filename)
        # os.remove cannot delete a directory; skip anything that is not a regular
        # file so one oddly named sub-directory does not abort the whole cleanup.
        if os.path.isfile(file_path):
            os.remove(file_path)

In [None]:
# Remove .mp4 / .meta.json files from the DDPG training render folder.
# NOTE(review): hard-coded absolute path; only valid for this workspace layout.
remove_renders("/workspaces/RL_Agents/pytorch/src/app/assets/models/ddpg/renders/training")

# HER Fetch-Reach (Robotics)

In [None]:
# Create the FetchReach-v2 environment with max_episode_steps set to 50.
env = gym.make("FetchReach-v2", max_episode_steps=50)

In [None]:
# Ask gym_helper for the three HER callables for this env; they replace the
# hand-written Pendulum versions defined earlier in the notebook.
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# Display what the achieved-goal callable returns for the current env.
achieved_goal_func(env)

In [None]:
# Look up the env's `_get_obs` through the wrapper stack, call it, and display the result.
env.get_wrapper_attr("_get_obs")()

In [None]:
# Reset the env state; the value returned by reset() is shown as the cell output.
env.reset()

In [None]:
# Shape of the desired-goal array; passed to the actor, critic and replay buffer below.
goal_shape = desired_goal_func(env).shape

In [None]:
# Display the goal shape.
goal_shape

In [None]:
# build actor

# Three identical entries: (64, "relu", {"kaiming uniform": {}}).
# Presumably (units, activation, initializer config) — TODO confirm against models.ActorModel.
dense_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

actor = models.ActorModel(
    env,
    cnn_model=None,
    dense_layers=dense_layers,
    goal_shape=goal_shape,
    optimizer='Adam',
    optimizer_params={'weight_decay': 0.0},
    learning_rate=1e-5,
    normalize_layers=False,
)

In [None]:
# Display the actor model.
actor

In [None]:
# build critic

# `state_layers` is left empty for this critic.
state_layers = []

# Three identical entries: (64, "relu", {"kaiming uniform": {}}).
# Presumably (units, activation, initializer config) — TODO confirm against models.CriticModel.
merged_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

critic = models.CriticModel(
    env=env,
    cnn_model=None,
    state_layers=state_layers,
    merged_layers=merged_layers,
    goal_shape=goal_shape,
    optimizer="Adam",
    optimizer_params={'weight_decay': 0.0},
    learning_rate=1e-5,
    normalize_layers=False,
)

In [None]:
# Display the critic model.
critic

In [None]:
# Replay buffer constructed with the env, the value 1000000 and the goal shape.
replay_buffer = helper.ReplayBuffer(env, 1_000_000, goal_shape)

# Exploration noise: zero-mean normal noise with stddev 0.05, shaped like the action space.
# Alternative kept for reference:
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(
    shape=env.action_space.shape,
    mean=0.0,
    stddev=0.05,
)

In [None]:
# Assemble the FetchReach DDPG agent from the models, buffer and noise built above,
# with a W&B callback for the "FetchReach-v2" project.
ddpg_agent = rl_agents.DDPG(
    env=env,
    actor_model=actor,
    critic_model=critic,
    replay_buffer=replay_buffer,
    noise=noise,
    discount=0.98,
    tau=0.05,
    action_epsilon=0.2,
    batch_size=256,
    callbacks=[rl_callbacks.WandbCallback("FetchReach-v2")],
)

In [None]:
# Display the critic model attached to the DDPG agent.
ddpg_agent.critic_model

In [None]:
# Wrap the FetchReach DDPG agent in HER with the 'future' strategy and the
# goal/reward callables obtained from gym_helper.
her = rl_agents.HER(
    agent=ddpg_agent,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    strategy='future',
    num_goals=4,
    tolerance=0.05,
    normalizer_clip=5.0,
)

In [None]:
# Full FetchReach training run; rendering is enabled with render_freq=1000.
her.train(
    num_epochs=50,
    num_cycles=50,
    num_episodes=16,
    num_updates=40,
    render=True,
    render_freq=1000,
)

In [None]:
# Sample from the agent's replay buffer (argument 2) and unpack the eight returned fields.
(
    states,
    action,
    rewards,
    next_states,
    dones,
    achieved_goals,
    next_achieved_goals,
    desired_goals,
) = her.agent.replay_buffer.sample(2)

In [None]:
# Display the desired goals from the sampled batch.
desired_goals

In [None]:
# Display the env's `distance_threshold` attribute.
her.agent.env.get_wrapper_attr("distance_threshold")

In [None]:
# Call the env's `_is_success` with the current achieved goal and desired goal
# and display what it returns.
her.agent.env.get_wrapper_attr("_is_success")(achieved_goal_func(her.agent.env), desired_goal_func(her.agent.env))

In [None]:
# Call the env's `goal_distance` on an achieved goal and a desired goal.
# NOTE(review): `next_state_achieved_goal` and `desired_goal` are not assigned in any
# of the nearby cells (the sampled batch uses the plural names) — confirm they are
# defined before running this cell.
her.agent.env.get_wrapper_attr("goal_distance")(next_state_achieved_goal, desired_goal, None)

In [None]:
# Load a saved HER agent from the `her` model directory.
# NOTE(review): hard-coded absolute path; only valid for this workspace layout.
pusher_her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/assets/models/her")

In [None]:
# Reset the loaded agent's env; the value returned by reset() is shown as the cell output.
pusher_her.agent.env.reset()

In [None]:
# Display the loaded HER agent's configuration.
pusher_her.get_config()

In [None]:
# Finish the current Weights & Biases run.
wandb.finish()

In [None]:
# Euclidean norm of the difference between what `get_body_com` returns for the
# "goal" body and for the "object" body.
np.linalg.norm(pusher_her.agent.env.get_wrapper_attr("get_body_com")("goal") - pusher_her.agent.env.get_wrapper_attr("get_body_com")("object"))

In [None]:
# Display the loaded agent's replay-buffer configuration.
pusher_her.agent.replay_buffer.get_config()

In [None]:

# Display the `desired_goals` attribute of the loaded agent's replay buffer.
pusher_her.agent.replay_buffer.desired_goals

In [None]:
## TEST ENV
# Create Pusher-v5 with rgb_array rendering; the next cell wraps it in a video recorder.
env = gym.make("Pusher-v5", render_mode="rgb_array")

In [None]:
# Wrap the env in a video recorder whose trigger returns True for every episode.
# NOTE(review): the leading "/" makes the output folder an absolute path at the
# filesystem root — confirm that is intended rather than a relative "renders/training".
env = gym.wrappers.RecordVideo(
    env,
    "/renders/training",
    episode_trigger=lambda _episode_id: True,
)


In [None]:
# Drive the recorded env with 1000 uniformly sampled random actions.
state, _ = env.reset()

try:
    for i in range(1000):
        # take action
        next_state, reward, term, trunc, _ = env.step(env.action_space.sample())
        # Once an episode has terminated or been truncated the env must be reset
        # before it is stepped again; otherwise the loop keeps stepping a finished
        # episode for the remaining iterations.
        if term or trunc:
            next_state, _ = env.reset()
        state = next_state
finally:
    # Close the env even if a step raises.
    env.close()

# HER Fetch Push (Robotics)

In [None]:
# Create the FetchPush-v2 environment with its default settings.
env = gym.make('FetchPush-v2')

In [None]:
# Ask gym_helper for the three HER callables (desired goal, achieved goal, reward) for this env.
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# Reset the env state; the value returned by reset() is shown as the cell output.
env.reset()

In [None]:
# Shape of the desired-goal array; passed to the actor, critic and replay buffer below.
goal_shape = desired_goal_func(env).shape

In [None]:
# build actor

# Three identical entries: (64, "relu", {"kaiming uniform": {}}).
# Presumably (units, activation, initializer config) — TODO confirm against models.ActorModel.
dense_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

actor = models.ActorModel(
    env,
    cnn_model=None,
    dense_layers=dense_layers,
    goal_shape=goal_shape,
    optimizer='Adam',
    optimizer_params={'weight_decay': 0.0},
    learning_rate=1e-5,
    normalize_layers=False,
)

In [None]:
# build critic

# `state_layers` is left empty for this critic.
state_layers = []

# Three identical entries: (64, "relu", {"kaiming uniform": {}}).
# Presumably (units, activation, initializer config) — TODO confirm against models.CriticModel.
merged_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

critic = models.CriticModel(
    env=env,
    cnn_model=None,
    state_layers=state_layers,
    merged_layers=merged_layers,
    goal_shape=goal_shape,
    optimizer="Adam",
    optimizer_params={'weight_decay': 0.0},
    learning_rate=1e-5,
    normalize_layers=False,
)

In [None]:
# Replay buffer constructed with the env, the value 1000000 and the goal shape.
replay_buffer = helper.ReplayBuffer(env, 1_000_000, goal_shape)

# Exploration noise: zero-mean normal noise with stddev 0.05, shaped like the action space.
# Alternative kept for reference:
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(
    shape=env.action_space.shape,
    mean=0.0,
    stddev=0.05,
)

In [None]:
# Assemble the FetchPush DDPG agent; it is given "fetch_push/models/ddpg/" as
# save_dir and a W&B callback for the "FetchPush-v2" project.
ddpg_agent = rl_agents.DDPG(
    env=env,
    actor_model=actor,
    critic_model=critic,
    replay_buffer=replay_buffer,
    noise=noise,
    discount=0.98,
    tau=0.05,
    action_epsilon=0.3,
    batch_size=128,
    callbacks=[rl_callbacks.WandbCallback("FetchPush-v2")],
    save_dir="fetch_push/models/ddpg/",
)

In [None]:
# Wrap the FetchPush DDPG agent in HER with the 'final' strategy; the wrapper is
# given "fetch_push/models/her/" as save_dir.
her = rl_agents.HER(
    agent=ddpg_agent,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    strategy='final',
    num_goals=4,
    tolerance=0.05,
    normalizer_clip=5.0,
    save_dir="fetch_push/models/her/",
)

In [None]:
# Full FetchPush training run; rendering is enabled with render_freq=1000.
her.train(
    num_epochs=50,
    num_cycles=50,
    num_episodes=16,
    num_updates=40,
    render=True,
    render_freq=1000,
)

# TESTING MULTITHREADING

In [None]:
# Create a fresh FetchPush-v2 environment for the multi-worker experiment.
env = gym.make('FetchPush-v2')

In [None]:
# Ask gym_helper for the three HER callables (desired goal, achieved goal, reward) for this env.
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# Reset the env state; the value returned by reset() is shown as the cell output.
env.reset()

In [None]:
# Shape of the desired-goal array; passed to the actor, critic and replay buffer below.
goal_shape = desired_goal_func(env).shape

In [None]:
# build actor

# Three identical entries: (64, "relu", {"kaiming uniform": {}}).
# Presumably (units, activation, initializer config) — TODO confirm against models.ActorModel.
dense_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

actor = models.ActorModel(
    env,
    cnn_model=None,
    dense_layers=dense_layers,
    goal_shape=goal_shape,
    optimizer='Adam',
    optimizer_params={'weight_decay': 0.0},
    learning_rate=1e-5,
    normalize_layers=False,
)

In [None]:
# build critic

# `state_layers` is left empty for this critic.
state_layers = []

# Three identical entries: (64, "relu", {"kaiming uniform": {}}).
# Presumably (units, activation, initializer config) — TODO confirm against models.CriticModel.
merged_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

critic = models.CriticModel(
    env=env,
    cnn_model=None,
    state_layers=state_layers,
    merged_layers=merged_layers,
    goal_shape=goal_shape,
    optimizer="Adam",
    optimizer_params={'weight_decay': 0.0},
    learning_rate=1e-5,
    normalize_layers=False,
)

In [None]:
# Replay buffer constructed with the env, the value 1000000 and the goal shape.
replay_buffer = helper.ReplayBuffer(env, 1_000_000, goal_shape)

# Exploration noise: zero-mean normal noise with stddev 0.05, shaped like the action space.
# Alternative kept for reference:
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(
    shape=env.action_space.shape,
    mean=0.0,
    stddev=0.05,
)

In [None]:
# Assemble the FetchPush DDPG agent for the multi-worker run; it is given
# "fetch_push/models/ddpg/" as save_dir and a W&B callback for "FetchPush-v2".
ddpg_agent = rl_agents.DDPG(
    env=env,
    actor_model=actor,
    critic_model=critic,
    replay_buffer=replay_buffer,
    noise=noise,
    discount=0.98,
    tau=0.05,
    action_epsilon=0.3,
    batch_size=128,
    callbacks=[rl_callbacks.WandbCallback("FetchPush-v2")],
    save_dir="fetch_push/models/ddpg/",
)

In [None]:
# Same HER setup as the single-worker FetchPush run, but with num_workers=4.
her = rl_agents.HER(
    agent=ddpg_agent,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    strategy='final',
    num_workers=4,
    num_goals=4,
    tolerance=0.05,
    normalizer_clip=5.0,
    save_dir="fetch_push/models/her/",
)

In [None]:
# Train with HER.train's default arguments (none are passed here).
her.train()

# TESTING

In [None]:
# load config
# NOTE(review): hard-coded absolute path; only valid for this workspace layout.
config_path = "/workspaces/RL_Agents/pytorch/src/app/HER_Test/her/config.json"

with open(config_path) as config_file:
    config = json.load(config_file)

In [None]:
# Display the loaded configuration.
config

In [None]:
# Rebuild a HER agent from the loaded config.
# NOTE(review): another cell calls `rl_agents.HER.load` with a directory path string
# instead of a parsed config — confirm which argument form `load` expects.
agent = rl_agents.HER.load(config)

In [None]:
# Print the private `_sweep` attribute of every callback on the wrapped agent.
for cb in agent.agent.callbacks:
    print(cb._sweep)

# Co-Occurrence

In [None]:
import subprocess

In [None]:
# Location of the wandb configuration JSON.
config_file_path = 'assets/wandb_config.json'

# Parse the file into `wandb_config`.
with open(config_file_path) as config_file:
    wandb_config = json.load(config_file)

# Echo the parsed configuration as a sanity check.
print(wandb_config)

In [None]:
# Location of the sweep configuration JSON.
config_file_path = 'assets/sweep_config.json'

# Parse the file into `sweep_config`.
with open(config_file_path) as config_file:
    sweep_config = json.load(config_file)

# Echo the parsed configuration as a sanity check.
print(sweep_config)

In [None]:
# Write both configurations into ./sweep (created if it does not exist yet).
os.makedirs('sweep', exist_ok=True)

train_config_path = os.path.join(os.getcwd(), 'sweep/train_config.json')
sweep_config_path = os.path.join(os.getcwd(), 'sweep/sweep_config.json')

# Note the cross-over in naming: `sweep_config` is stored as train_config.json and
# `wandb_config` is stored as sweep_config.json.
for target_path, payload in (
    (train_config_path, sweep_config),
    (sweep_config_path, wandb_config),
):
    with open(target_path, 'w') as out_file:
        json.dump(payload, out_file)

In [None]:
import sys

# Set the environment variable before launching; a child started without an
# explicit `env` inherits the current process environment.
os.environ['WANDB_DISABLE_SERVICE'] = 'true'

# Run sweep.py with the interpreter that runs this kernel. A bare 'python' is
# resolved through PATH and may be a different installation without the
# project's dependencies.
command = [sys.executable, 'sweep.py']

# Keep the handle so the sweep can be polled, waited on or terminated later.
sweep_process = subprocess.Popen(command)
sweep_process

In [None]:
# Set WANDB_DISABLE_SERVICE to 'true' for this kernel process (and any child it starts).
os.environ['WANDB_DISABLE_SERVICE'] = 'true'

In [None]:
# Location of the sweep configuration written into ./sweep.
config_file_path = 'sweep/sweep_config.json'

# Parse the file into `sweep_config`.
with open(config_file_path) as config_file:
    sweep_config = json.load(config_file)

# Echo the parsed configuration as a sanity check.
print(sweep_config)

In [None]:
# Location of the train configuration written into ./sweep.
config_file_path = 'sweep/train_config.json'

# Parse the file into `train_config`.
with open(config_file_path) as config_file:
    train_config = json.load(config_file)

# Echo the parsed configuration as a sanity check.
print(train_config)

In [None]:
def _run_one_sweep_trial():
    """Run one sweep trial using the notebook's current `sweep_config` and `train_config`."""
    return wandb_support._run_sweep(sweep_config, train_config)


# Register the sweep under the project named in the sweep config.
sweep_id = wandb.sweep(sweep=sweep_config, project=sweep_config["project"])

# loop over num wandb agents
# (only one agent is started; the loop itself is disabled)
num_agents = 1
# for agent in range(num_agents):
wandb.agent(
    sweep_id,
    function=_run_one_sweep_trial,
    count=train_config['num_sweeps'],
    project=sweep_config["project"],
)

In [None]:
# Display the sweep configuration.
sweep_config

# PPO

In [None]:
from pathlib import Path
from typing import List, Tuple
import torch.nn.functional as F
from torch.distributions import Categorical, Beta, Normal, kl_divergence
import time
import cv2

In [None]:
# PPO with the 'kl' loss on a vectorized BipedalWalker (kl_coeff = 0.1).

# PARAMS
# Alternative environment ids: 'Pendulum-v1', 'LunarLanderContinuous-v3'
env_id = 'BipedalWalker-v3'
policy_lr = 3e-4
value_lr = 2e-5
entropy_coeff = 0.1
kl_coeff = 0.1
loss = 'kl'
timesteps = 100_000
num_envs = 10
device = 'cuda'

seed = 42
env = gym.make_vec(env_id, num_envs)

# Seed the torch and numpy global RNGs.
T.manual_seed(seed)
T.cuda.manual_seed(seed)
np.random.seed(seed)
# Gymnasium environments are seeded through reset(seed=...); assigning to
# `gym.utils.seeding.np_random.seed` only sets an attribute on a function and
# seeds nothing, so seed the env and its action space explicitly instead.
# NOTE(review): if PPO.train builds or re-seeds its own environments, the seed
# has to be passed there as well — confirm.
env.reset(seed=seed)
env.action_space.seed(seed)

# Build policy model
dense_layers = [(128, "tanh", {"default": {}}), (128, "tanh", {"default": {}})]
policy = StochasticContinuousPolicy(env, num_envs, dense_layers, learning_rate=policy_lr, distribution='Beta', device=device)

# Build value model (same layer spec, separate list object)
dense_layers = [(128, "tanh", {"default": {}}), (128, "tanh", {"default": {}})]
value_function = ValueModel(env, dense_layers, learning_rate=value_lr, device=device)

ppo_agent_hybrid1 = PPO(
    env,
    policy,
    value_function,
    distribution='Beta',
    discount=0.99,
    gae_coefficient=0.95,
    policy_clip=0.2,
    entropy_coefficient=entropy_coeff,
    kl_coefficient=kl_coeff,
    loss=loss,
)
hybrid_train_info_1 = ppo_agent_hybrid1.train(
    timesteps=timesteps,
    trajectory_length=2048,
    batch_size=640,
    learning_epochs=10,
    num_envs=num_envs,
)

In [None]:
# PPO with the 'kl' loss on a vectorized BipedalWalker (kl_coeff = 0.01).

# PARAMS
# Alternative environment ids: 'Pendulum-v1', 'LunarLanderContinuous-v3'
env_id = 'BipedalWalker-v3'
policy_lr = 3e-4
value_lr = 2e-5
entropy_coeff = 0.1
kl_coeff = 0.01
loss = 'kl'
timesteps = 100_000
num_envs = 10
device = 'cuda'

seed = 42
env = gym.make_vec(env_id, num_envs)

# Seed the torch and numpy global RNGs.
T.manual_seed(seed)
T.cuda.manual_seed(seed)
np.random.seed(seed)
# Gymnasium environments are seeded through reset(seed=...); assigning to
# `gym.utils.seeding.np_random.seed` only sets an attribute on a function and
# seeds nothing, so seed the env and its action space explicitly instead.
# NOTE(review): if PPO.train builds or re-seeds its own environments, the seed
# has to be passed there as well — confirm.
env.reset(seed=seed)
env.action_space.seed(seed)

# Build policy model
dense_layers = [(128, "tanh", {"default": {}}), (128, "tanh", {"default": {}})]
policy = StochasticContinuousPolicy(env, num_envs, dense_layers, learning_rate=policy_lr, distribution='Beta', device=device)

# Build value model (same layer spec, separate list object)
dense_layers = [(128, "tanh", {"default": {}}), (128, "tanh", {"default": {}})]
value_function = ValueModel(env, dense_layers, learning_rate=value_lr, device=device)

ppo_agent_hybrid2 = PPO(
    env,
    policy,
    value_function,
    distribution='Beta',
    discount=0.99,
    gae_coefficient=0.95,
    policy_clip=0.2,
    entropy_coefficient=entropy_coeff,
    kl_coefficient=kl_coeff,
    loss=loss,
)
hybrid_train_info_2 = ppo_agent_hybrid2.train(
    timesteps=timesteps,
    trajectory_length=2048,
    batch_size=640,
    learning_epochs=10,
    num_envs=num_envs,
)

In [2]:
## PARAMS ##
# Alternative environment ids (swap in to change the task):
#   'Pendulum-v1', 'LunarLanderContinuous-v3', 'Humanoid-v5', "Reacher-v5",
#   "Walker2d-v5", 'ALE/SpaceInvaders-ram-v5', "CarRacing-v2"
env_id = 'BipedalWalker-v3'

# Rollout / optimisation schedule
timesteps = 1_000_000
trajectory_length = 2000
batch_size = 64
learning_epochs = 10
num_envs = 4

# Learning rates
policy_lr = 3e-4
value_lr = 2e-5

# Loss settings
entropy_coeff = 0.001
loss = 'hybrid'
kl_coeff = 0.0
lambda_ = 0.0

# Normalisation and clipping
normalize_advantages = True
normalize_values = False
norm_clip = np.inf
grad_clip = 40.0
reward_clip = 1.0

distribution = 'beta'
device = 'cuda'

# Render Settings
render_freq = 20

## WANDB ##
project_name = 'BipedalWalker-v3'
run_name = None
callbacks = [WandbCallback(project_name, run_name)]
# callbacks = []

# No global RNG seeding is done in this cell; `seed` is only passed to ppo.train.
seed = 42
env = gym.make(env_id)

save_dir = "BipedalWalker"

# Build policy model
# Two (dense 64 -> tanh) stages. For image observations, cnn/flatten entries such as
# {'type': 'cnn', 'params': {'out_channels': 32, 'kernel_size': (8, 8), 'stride': 4, 'padding': 0}}
# followed by {'type': 'flatten'} were previously placed ahead of the dense stages.
layer_config = [
    layer
    for _ in range(2)
    for layer in (
        {'type': 'dense', 'params': {'units': 64, 'kernel': 'default', 'kernel params': {}}},
        {'type': 'tanh'},
    )
]

# NOTE(review): this value is a 1-tuple wrapping the dict (the original statement
# ended with a trailing comma). The tuple is kept as-is because the models were run
# with it — confirm whether a bare dict was intended.
output_layer_kernel = (
    {'type': 'dense', 'params': {'kernel': 'default', 'kernel params': {}}},
)

policy = StochasticContinuousPolicy(
    env,
    layer_config,
    output_layer_kernel,
    learning_rate=policy_lr,
    distribution=distribution,
    device=device,
)

# The value model receives the same `layer_config` and `output_layer_kernel` objects.
value_function = ValueModel(
    env,
    layer_config,
    output_layer_kernel,
    learning_rate=value_lr,
    device=device,
)

ppo = PPO(
    env,
    policy,
    value_function,
    distribution=distribution,
    discount=0.99,
    gae_coefficient=0.95,
    policy_clip=0.2,
    entropy_coefficient=entropy_coeff,
    loss=loss,
    kl_coefficient=kl_coeff,
    normalize_advantages=normalize_advantages,
    normalize_values=normalize_values,
    value_normalizer_clip=norm_clip,
    policy_grad_clip=grad_clip,
    reward_clip=reward_clip,
    lambda_=lambda_,
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
)

hybrid_train_info_2 = ppo.train(
    timesteps=timesteps,
    trajectory_length=trajectory_length,
    batch_size=batch_size,
    learning_epochs=learning_epochs,
    num_envs=num_envs,
    seed=seed,
    render_freq=render_freq,
)
# ppo.test(10,"ppo_test", 1)


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjasonhayes1987[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Rendering episode 0.0 during training...
seed value:42
episode number sent to renderer:0.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_0.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_0.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_0.0.mp4
episode rendered
Episode 1/1 - Score: [-99.85492624]
episode: [0. 0. 0. 1.]; total steps: 1000; episodes scores: [          nan           nan           nan -101.03860488]; avg score: -101.03860488181127
learning timestep: 2000
Policy Loss: -0.09057190269231796
Value Loss: 0.8735776543617249
Entropy: -0.017818227410316467
KL Divergence: 0.01843278668820858
episode: [1. 5. 2. 6.]; total steps: 2000; episodes scores: [ -80.32775573 -114.28282535  -97.89942664 -102.67133195]; avg score: -98.79533491880647
episode: [1. 5. 2. 8.]; total steps: 3000; episodes scores: [ -80.32775573 -114.28282535  -97.89942664 -112.5987752 ]; avg score: -101.2771957289266
learning timestep: 4000
Policy Loss: 0.031096884980797768
Value Loss: 0.5866125822067261
Entropy: -0.030824322253465652
KL Divergence: 0.03859567642211914
episode: [3. 6. 4. 9.]; total steps: 4000; episodes scores: [-117.39999403  -82.22523912 -112.37502366

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_20.0.mp4
episode rendered
Episode 1/1 - Score: [-100.6433355]
learning timestep: 12000
Policy Loss: -0.0846797302365303
Value Loss: 0.544185221195221
Entropy: -0.09277000278234482
KL Divergence: 0.09572620689868927
episode: [22. 18. 20. 16.]; total steps: 12000; episodes scores: [ -98.77749937  -98.93496061 -103.19835746 -110.08911576]; avg score: -102.74998330133394
episode: [25. 19. 22. 18.]; total steps: 13000; episodes scores: [-100.47691421  -75.21154965  -99.50660026  -99.71134226]; avg score: -93.7266015952051
learning timestep: 14000
Policy Loss: 0.09742354601621628
Value Loss: 0.12918342649936676
Entropy: -0.11107911169528961
KL Divergence: 0.1170923262834549
episode: [25. 21. 22. 20.]; total steps: 14000; episodes scores: [-100.47691421  -98.48433425  -99.50660026 -111.32540812]; avg score: -102.44831421038151
episode: [27. 21. 27. 29.]; total steps: 15000; episodes scores: [-100.29194199  -98.4843

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_40.0.mp4
episode rendered
Episode 1/1 - Score: [-100.73639117]
learning timestep: 22000
Policy Loss: -0.06146393343806267
Value Loss: 0.5491770505905151
Entropy: -0.2332465499639511
KL Divergence: 0.303682804107666
episode: [41. 35. 39. 36.]; total steps: 22000; episodes scores: [-103.076917    -70.65138613  -99.0159316   -69.07000928]; avg score: -85.45356100114452
episode: [41. 40. 39. 37.]; total steps: 23000; episodes scores: [-103.076917    -99.43222064  -99.0159316   -68.72426229]; avg score: -92.56233288343375
learning timestep: 24000
Policy Loss: -0.03518760949373245
Value Loss: 0.6722046732902527
Entropy: -0.2695491909980774
KL Divergence: 0.2790902256965637
episode: [45. 42. 46. 41.]; total steps: 24000; episodes scores: [-103.26025268 -102.03156492 -101.68352967  -98.51990562]; avg score: -101.37381322263506
episode: [45. 46. 46. 44.]; total steps: 25000; episodes scores: [-103.26025268 -100.80880

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_60.0.mp4
episode rendered
Episode 1/1 - Score: [-100.8959374]
episode: [61. 56. 60. 60.]; total steps: 31000; episodes scores: [ -97.70474976 -101.36556225 -101.18869899  -98.63153305]; avg score: -99.72263601405608
learning timestep: 32000
Policy Loss: 0.206534281373024
Value Loss: 0.8253991007804871
Entropy: -0.3275437355041504
KL Divergence: 0.44670218229293823
episode: [73. 59. 63. 67.]; total steps: 32000; episodes scores: [ -99.76154661  -98.26483013 -121.51909062 -120.84675714]; avg score: -110.098056124735
episode: [76. 72. 64. 67.]; total steps: 33000; episodes scores: [-100.01585605 -101.46574274  -65.58283186 -120.84675714]; avg score: -96.97779694990908
learning timestep: 34000
Policy Loss: -0.06292789429426193
Value Loss: 0.21908555924892426
Entropy: -0.41284775733947754
KL Divergence: 0.48999178409576416
episode: [79. 74. 65. 69.]; total steps: 34000; episodes scores: [-100.51854348 -101.583193

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_80.0.mp4
episode rendered
Episode 1/1 - Score: [-101.80572007]
episode: [91. 79. 68. 73.]; total steps: 35000; episodes scores: [-101.27822294 -100.64195336 -101.83327306  -97.32836401]; avg score: -100.27045334448789
learning timestep: 36000
Policy Loss: -0.16398070752620697
Value Loss: 0.7818644046783447
Entropy: -0.34984922409057617
KL Divergence: 0.48864907026290894
episode: [91. 86. 68. 81.]; total steps: 36000; episodes scores: [-101.27822294  -99.10845415 -101.83327306  -99.1357926 ]; avg score: -100.33893568917456
episode: [98. 86. 76. 81.]; total steps: 37000; episodes scores: [-101.68668452  -99.10845415 -101.16859463  -99.1357926 ]; avg score: -100.27488147566172
Rendering episode 100.0 during training...
seed value:42
episode number sent to renderer:100.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_100.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/rend

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_100.0.mp4
episode rendered
Episode 1/1 - Score: [-100.24571092]
learning timestep: 38000
Policy Loss: -0.006258991546928883
Value Loss: 1.0293112993240356
Entropy: -0.3691323399543762
KL Divergence: 0.49678343534469604
episode: [112.  93.  90.  91.]; total steps: 38000; episodes scores: [-100.1737006   -97.88105818 -119.00414115 -102.3305255 ]; avg score: -104.84735635737066
episode: [112.  93.  94.  97.]; total steps: 39000; episodes scores: [-100.1737006   -97.88105818 -101.58725874  -99.47621308]; avg score: -99.77955764703125
learning timestep: 40000
Policy Loss: -0.09746932983398438
Value Loss: 0.8506950736045837
Entropy: -0.49663203954696655
KL Divergence: 0.7422729730606079
episode: [117. 100.  97.  98.]; total steps: 40000; episodes scores: [-118.23184692  -98.13095382  -99.69115083  -65.52005155]; avg score: -95.39350078068396
Rendering episode 120.0 during training...
seed value:42
episode number s

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_120.0.mp4
episode rendered
Episode 1/1 - Score: [-100.56075583]
episode: [131. 100. 110. 112.]; total steps: 41000; episodes scores: [-99.7796337  -98.13095382 -98.53050658 -99.25631814]; avg score: -98.92435305619381
Rendering episode 140.0 during training...
seed value:42
episode number sent to renderer:140.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_140.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_140.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_140.0.mp4
episode rendered
Episode 1/1 - Score: [-100.56075583]
learning timestep: 42000
Policy Loss: -0.010150564834475517
Value Loss: 0.7862306833267212
Entropy: -0.3787301480770111
KL Divergence: 0.538067102432251
episode: [140. 106. 110. 126.]; total steps: 42000; episodes scores: [ -99.25775404 -102.1487068   -98.53050658 -102.08902697]; avg score: -100.50649859629371
episode: [140. 106. 116. 139.]; total steps: 43000; episodes scores: [ -99.25775404 -102.1487068   -99.8019578   -97.10548523]; avg score: -99.57847596673544
learning timestep: 44000
Policy Loss: -0.014195664785802364
Value Loss: 0.8111121654510498
Entropy: -0.4361358880996704
KL Divergence: 0.6021593809127808
episode: [151. 116. 118. 153.]; total steps: 44000; episodes scores: [ -97.75491666  -99.82591216 -117.16668114  -97.74066312]; avg score: -103.12204326828802
Rendering episode 160.0 during training...
seed value:42
episode number se

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_160.0.mp4
episode rendered
Episode 1/1 - Score: [-101.3951093]




episode: [165. 130. 122. 168.]; total steps: 45000; episodes scores: [ -95.25207575 -100.57278474 -100.86177278 -101.01941314]; avg score: -99.42651160266095
learning timestep: 46000
Policy Loss: 0.255938321352005
Value Loss: 0.830669641494751
Entropy: -0.4136633276939392
KL Divergence: 0.6997597217559814
episode: [176. 145. 135. 181.]; total steps: 46000; episodes scores: [ -98.30949155  -98.50426016 -118.47105396  -99.13527117]; avg score: -103.60501920934365
episode: [176. 157. 149. 195.]; total steps: 47000; episodes scores: [-98.30949155 -96.2667027  -98.87847439 -99.53581048]; avg score: -98.24761977851686
Rendering episode 180.0 during training...
seed value:42
episode number sent to renderer:180.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_180.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_180.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_180.0.mp4
episode rendered
Episode 1/1 - Score: [-100.60072176]




learning timestep: 48000
Policy Loss: 0.024178197607398033
Value Loss: 1.2553355693817139
Entropy: -0.4060301184654236
KL Divergence: 0.6466089487075806
episode: [187. 159. 152. 209.]; total steps: 48000; episodes scores: [ -97.70066652  -99.22768939 -102.1549143   -99.61781964]; avg score: -99.67527246074347
episode: [193. 162. 155. 219.]; total steps: 49000; episodes scores: [ -98.7629263  -100.67606603  -97.65300382  -98.76299801]; avg score: -98.96374854109932
learning timestep: 50000
Policy Loss: -0.10868334770202637
Value Loss: 0.8332064747810364
Entropy: -0.507004976272583
KL Divergence: 0.7302166819572449
episode: [193. 174. 169. 231.]; total steps: 50000; episodes scores: [ -98.7629263  -100.04758049  -98.36158696  -97.74904914]; avg score: -98.730285725633
Rendering episode 200.0 during training...
seed value:42
episode number sent to renderer:200.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_200.0.mp4.
Moviepy - Writing video Bipedal

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_200.0.mp4
episode rendered
Episode 1/1 - Score: [-100.7509892]




episode: [206. 176. 184. 247.]; total steps: 51000; episodes scores: [-100.38306923  -98.35903515  -98.50268616  -99.92473429]; avg score: -99.29238120981321
Rendering episode 220.0 during training...
seed value:42
episode number sent to renderer:220.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_220.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_220.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_220.0.mp4
episode rendered
Episode 1/1 - Score: [-100.7509892]
learning timestep: 52000
Policy Loss: -0.04783152788877487
Value Loss: 1.0836371183395386
Entropy: -0.5331117510795593
KL Divergence: 0.7609465718269348
episode: [220. 181. 190. 253.]; total steps: 52000; episodes scores: [-100.26299334 -100.89722552  -99.11853801 -100.31437333]; avg score: -100.14828255087603
episode: [233. 194. 191. 254.]; total steps: 53000; episodes scores: [ -99.75987603 -102.52766382  -55.04097809  -60.32194163]; avg score: -79.41261488973738
Rendering episode 240.0 during training...
seed value:42
episode number sent to renderer:240.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_240.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_240.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_240.0.mp4
episode rendered
Episode 1/1 - Score: [-98.07470153]
learning timestep: 54000
Policy Loss: -0.12846939265727997
Value Loss: 0.8972062468528748
Entropy: -0.4789048433303833
KL Divergence: 0.7139427065849304
episode: [249. 196. 205. 268.]; total steps: 54000; episodes scores: [-101.33168741  -96.62330383  -99.55860388  -99.97292202]; avg score: -99.37162928453569
episode: [254. 201. 219. 282.]; total steps: 55000; episodes scores: [-98.3493436  -99.08252183 -99.93710978 -99.53014539]; avg score: -99.22478015066156
learning timestep: 56000
Policy Loss: -0.10200221836566925
Value Loss: 0.7011415958404541
Entropy: -0.5851373076438904
KL Divergence: 0.8165121078491211
episode: [256. 216. 233. 297.]; total steps: 56000; episodes scores: [-98.29787187 -98.92835907 -99.4304114  -99.2767305 ]; avg score: -98.9833432103498
Rendering episode 260.0 during training...
seed value:42
episode number sent to rendere

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_260.0.mp4
episode rendered
Episode 1/1 - Score: [-97.63647947]
episode: [269. 231. 248. 310.]; total steps: 57000; episodes scores: [ -96.21562279  -97.81522627  -99.70741115 -112.74243804]; avg score: -101.62017456065288
Rendering episode 280.0 during training...
seed value:42
episode number sent to renderer:280.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_280.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_280.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_280.0.mp4
episode rendered
Episode 1/1 - Score: [-97.63647947]
learning timestep: 58000
Policy Loss: 0.12792494893074036
Value Loss: 0.9494129419326782
Entropy: -0.5974745154380798
KL Divergence: 0.796268880367279
episode: [284. 246. 263. 314.]; total steps: 58000; episodes scores: [ -98.18714502  -98.77352565 -100.12947068 -100.77024923]; avg score: -99.46509764682486
episode: [290. 260. 268. 317.]; total steps: 59000; episodes scores: [ -97.85465172  -99.68987541 -100.62855595  -97.15555658]; avg score: -98.83215991444125
learning timestep: 60000
Policy Loss: 0.11166303604841232
Value Loss: 1.7440390586853027
Entropy: -0.6074353456497192
KL Divergence: 0.6759482026100159
episode: [290. 276. 269. 320.]; total steps: 60000; episodes scores: [ -97.85465172  -97.39324659  -58.27197389 -100.83293749]; avg score: -88.58820242140024
Rendering episode 300.0 during training...
seed value:42
episode number sent to r

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_300.0.mp4
episode rendered
Episode 1/1 - Score: [-100.82259508]
episode: [305. 291. 283. 325.]; total steps: 61000; episodes scores: [-100.59366199  -97.77552327  -99.24185747 -101.98681492]; avg score: -99.89946441138193
Rendering episode 320.0 during training...
seed value:42
episode number sent to renderer:320.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_320.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_320.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_320.0.mp4
episode rendered
Episode 1/1 - Score: [-100.82259508]




learning timestep: 62000
Policy Loss: -0.06549153476953506
Value Loss: 1.565735101699829
Entropy: -0.6927109956741333
KL Divergence: 0.8290867209434509
episode: [320. 307. 297. 340.]; total steps: 62000; episodes scores: [ -99.60871769  -97.64440285  -99.16263277 -101.25673452]; avg score: -99.41812195946216
episode: [335. 322. 313. 355.]; total steps: 63000; episodes scores: [ -96.55195381 -100.54608497 -100.91627541  -99.61377436]; avg score: -99.40702213474628
Rendering episode 340.0 during training...
seed value:42
episode number sent to renderer:340.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_340.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_340.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_340.0.mp4
episode rendered
Episode 1/1 - Score: [-101.38803741]
learning timestep: 64000
Policy Loss: 0.016720805317163467
Value Loss: 1.2367851734161377
Entropy: -0.6385188698768616
KL Divergence: 1.0850565433502197
episode: [352. 335. 326. 368.]; total steps: 64000; episodes scores: [-114.18095343  -99.60369006 -100.60785719 -100.62352055]; avg score: -103.75400530716715
episode: [359. 351. 341. 368.]; total steps: 65000; episodes scores: [ -97.30481394 -100.56039038 -101.37734394 -100.62352055]; avg score: -99.96651720348109
learning timestep: 66000
Policy Loss: -0.01709786430001259
Value Loss: 0.8753975629806519
Entropy: -0.7194682955741882
KL Divergence: 1.0036393404006958
episode: [359. 367. 356. 377.]; total steps: 66000; episodes scores: [-97.30481394 -97.74106923 -97.74454368 -99.97812376]; avg score: -98.19213765313708
Rendering episode 360.0 during training...
seed value:42
episode number sent to 

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_360.0.mp4
episode rendered
Episode 1/1 - Score: [-100.37449731]
episode: [374. 374. 357. 387.]; total steps: 67000; episodes scores: [-100.00743928  -99.49535765  -97.72507503  -95.34722164]; avg score: -98.14377340031011
Rendering episode 380.0 during training...
seed value:42
episode number sent to renderer:380.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_380.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_380.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_380.0.mp4
episode rendered
Episode 1/1 - Score: [-100.37449731]
learning timestep: 68000
Policy Loss: 0.11178010702133179
Value Loss: 1.1275432109832764
Entropy: -0.6340888738632202
KL Divergence: 1.0688974857330322
episode: [388. 384. 363. 387.]; total steps: 68000; episodes scores: [-99.39558407 -96.347625   -97.54763052 -95.34722164]; avg score: -97.15951530856748
Rendering episode 400.0 during training...
seed value:42
episode number sent to renderer:400.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_400.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_400.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_400.0.mp4
episode rendered
Episode 1/1 - Score: [-101.34649756]
episode: [403. 398. 376. 399.]; total steps: 69000; episodes scores: [-97.68446346 -99.4052258  -99.72743731 -97.47759463]; avg score: -98.57368030061758
learning timestep: 70000
Policy Loss: -0.12012387812137604
Value Loss: 1.2273722887039185
Entropy: -0.6713162064552307
KL Divergence: 0.8397504091262817
episode: [419. 414. 392. 416.]; total steps: 70000; episodes scores: [ -98.43163223 -101.45238683  -98.61881811  -97.99502569]; avg score: -99.12446571502369
Rendering episode 420.0 during training...
seed value:42
episode number sent to renderer:420.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_420.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_420.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_420.0.mp4
episode rendered
Episode 1/1 - Score: [-100.78012271]
episode: [434. 414. 409. 432.]; total steps: 71000; episodes scores: [ -99.28171332 -101.45238683  -99.35717013  -98.6417877 ]; avg score: -99.68326449123785
Rendering episode 440.0 during training...
seed value:42
episode number sent to renderer:440.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_440.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_440.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_440.0.mp4
episode rendered
Episode 1/1 - Score: [-100.78012271]




learning timestep: 72000
Policy Loss: 0.035673219710588455
Value Loss: 0.3437882661819458
Entropy: -0.6751226186752319
KL Divergence: 0.9053938388824463
episode: [450. 420. 409. 446.]; total steps: 72000; episodes scores: [ -99.46581291 -100.04467273  -99.35717013  -99.52357938]; avg score: -99.59780878749504
Rendering episode 460.0 during training...
seed value:42
episode number sent to renderer:460.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_460.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_460.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_460.0.mp4
episode rendered
Episode 1/1 - Score: [-100.20795843]




episode: [466. 436. 417. 462.]; total steps: 73000; episodes scores: [-100.30918346 -100.48334763  -99.76531928 -101.83240054]; avg score: -100.59756272477611
Rendering episode 480.0 during training...
seed value:42
episode number sent to renderer:480.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_480.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_480.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_480.0.mp4
episode rendered
Episode 1/1 - Score: [-100.20795843]
learning timestep: 74000
Policy Loss: 0.034034308046102524
Value Loss: 0.7217748165130615
Entropy: -0.8033021688461304
KL Divergence: 0.7603370547294617
episode: [483. 453. 435. 478.]; total steps: 74000; episodes scores: [ -98.79423163  -99.71793439 -100.38309424  -98.17159593]; avg score: -99.26671404901768
Rendering episode 500.0 during training...
seed value:42
episode number sent to renderer:500.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_500.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_500.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_500.0.mp4
episode rendered
Episode 1/1 - Score: [-100.08097413]
episode: [500. 469. 452. 494.]; total steps: 75000; episodes scores: [-100.44357283  -97.47851217 -100.83945542 -100.06822802]; avg score: -99.70744210788655
learning timestep: 76000
Policy Loss: -0.13444474339485168
Value Loss: 0.47512295842170715
Entropy: -0.7750436067581177
KL Divergence: 0.8874062299728394
episode: [517. 484. 468. 508.]; total steps: 76000; episodes scores: [ -99.49327594  -99.76959715 -101.34077992  -98.6699647 ]; avg score: -99.81840442540945
Rendering episode 520.0 during training...
seed value:42
episode number sent to renderer:520.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_520.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_520.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_520.0.mp4
episode rendered
Episode 1/1 - Score: [-100.60033967]
episode: [533. 501. 483. 525.]; total steps: 77000; episodes scores: [ -98.20918691 -101.75849036  -95.16157551  -98.82919176]; avg score: -98.48961113219382
Rendering episode 540.0 during training...
seed value:42
episode number sent to renderer:540.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_540.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_540.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_540.0.mp4
episode rendered
Episode 1/1 - Score: [-100.60033967]
learning timestep: 78000
Policy Loss: -0.15826795995235443
Value Loss: 0.6224267482757568
Entropy: -0.7968094348907471
KL Divergence: 0.948544979095459
episode: [547. 516. 488. 541.]; total steps: 78000; episodes scores: [-98.53574782 -98.74776462 -95.07063356 -98.86303789]; avg score: -97.80429597129363
Rendering episode 560.0 during training...
seed value:42
episode number sent to renderer:560.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_560.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_560.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_560.0.mp4
episode rendered
Episode 1/1 - Score: [-99.62223889]




episode: [563. 534. 490. 554.]; total steps: 79000; episodes scores: [-97.39654094 -98.43387608 -97.35378458 -97.12254901]; avg score: -97.57668765138571
Rendering episode 580.0 during training...
seed value:42
episode number sent to renderer:580.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_580.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_580.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_580.0.mp4
episode rendered
Episode 1/1 - Score: [-99.62223889]




learning timestep: 80000
Policy Loss: -0.12849266827106476
Value Loss: 0.9428891539573669
Entropy: -0.77274489402771
KL Divergence: 1.0987989902496338
episode: [580. 551. 508. 554.]; total steps: 80000; episodes scores: [-98.64905453 -99.05439919 -99.96921983 -97.12254901]; avg score: -98.69880563777296
episode: [595. 566. 524. 564.]; total steps: 81000; episodes scores: [-98.36057085 -96.6786411  -99.03292781 -98.78943019]; avg score: -98.2153924842955
Rendering episode 600.0 during training...
seed value:42
episode number sent to renderer:600.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_600.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_600.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_600.0.mp4
episode rendered
Episode 1/1 - Score: [-97.83939526]
learning timestep: 82000
Policy Loss: -0.1891508847475052
Value Loss: 0.46155351400375366
Entropy: -0.7556453347206116
KL Divergence: 0.7523630261421204
episode: [611. 584. 540. 579.]; total steps: 82000; episodes scores: [ -97.13601609 -100.26678652  -98.24931073  -94.38970764]; avg score: -97.51045524468505
Rendering episode 620.0 during training...
seed value:42
episode number sent to renderer:620.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_620.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_620.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_620.0.mp4
episode rendered
Episode 1/1 - Score: [-97.43787986]
episode: [627. 602. 557. 596.]; total steps: 83000; episodes scores: [-98.64205475 -98.87351372 -99.27207768 -98.00658655]; avg score: -98.6985581766412
Rendering episode 640.0 during training...
seed value:42
episode number sent to renderer:640.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_640.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_640.0.mp4



                                                             

Moviepy - Done !




Moviepy - video ready BipedalWalker/ppo/renders/train/episode_640.0.mp4
episode rendered
Episode 1/1 - Score: [-97.43787986]
learning timestep: 84000
Policy Loss: 0.03358345851302147
Value Loss: 0.3743833303451538
Entropy: -0.8779285550117493
KL Divergence: 0.9766037464141846
episode: [642. 615. 574. 611.]; total steps: 84000; episodes scores: [-98.9569698  -98.47388543 -97.10998468 -97.02983831]; avg score: -97.89266955428694
episode: [658. 631. 590. 627.]; total steps: 85000; episodes scores: [-98.16360156 -98.06991558 -98.74767193 -98.51788706]; avg score: -98.37476903118116
Rendering episode 660.0 during training...
seed value:42
episode number sent to renderer:660.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_660.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_660.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_660.0.mp4
episode rendered
Episode 1/1 - Score: [-97.55059938]
learning timestep: 86000
Policy Loss: -0.12674473226070404
Value Loss: 0.4632703959941864
Entropy: -0.8300106525421143
KL Divergence: 0.8831361532211304
episode: [674. 646. 606. 642.]; total steps: 86000; episodes scores: [ -97.1483432   -97.95156368  -99.50463255 -100.05688666]; avg score: -98.66535652335841
Rendering episode 680.0 during training...
seed value:42
episode number sent to renderer:680.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_680.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_680.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_680.0.mp4
episode rendered
Episode 1/1 - Score: [-97.98601508]




episode: [690. 663. 623. 659.]; total steps: 87000; episodes scores: [-99.10974688 -98.55088942 -99.49808454 -95.6529142 ]; avg score: -98.20290875922004
Rendering episode 700.0 during training...
seed value:42
episode number sent to renderer:700.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_700.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_700.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_700.0.mp4
episode rendered
Episode 1/1 - Score: [-97.98601508]
learning timestep: 88000
Policy Loss: 0.09150655567646027
Value Loss: 0.3213774561882019
Entropy: -1.023683786392212
KL Divergence: 1.3418463468551636
episode: [706. 667. 639. 677.]; total steps: 88000; episodes scores: [-98.58925942 -99.5273179  -99.05137104 -98.72944703]; avg score: -98.97434884607559
Rendering episode 720.0 during training...
seed value:42
episode number sent to renderer:720.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_720.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_720.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_720.0.mp4
episode rendered
Episode 1/1 - Score: [-97.2715636]
episode: [722. 671. 655. 693.]; total steps: 89000; episodes scores: [ -97.3765722   -96.03374975 -100.26478616  -99.24232461]; avg score: -98.22935817876163
learning timestep: 90000
Policy Loss: 0.04853493720293045
Value Loss: 0.4091383218765259
Entropy: -0.9250882863998413
KL Divergence: 1.1622803211212158
episode: [737. 686. 669. 709.]; total steps: 90000; episodes scores: [-95.58229359 -98.72837424 -93.61000647 -98.13129246]; avg score: -96.51299168862418
episode: [737. 701. 683. 725.]; total steps: 91000; episodes scores: [-95.58229359 -99.99316219 -97.63372723 -97.97951981]; avg score: -97.79717570454316
Rendering episode 740.0 during training...
seed value:42
episode number sent to renderer:740.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_740.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_740.0.mp4
episode rendered
Episode 1/1 - Score: [-96.97316005]
learning timestep: 92000
Policy Loss: -0.05542585998773575
Value Loss: 0.3925161361694336
Entropy: -1.0837022066116333
KL Divergence: 1.4181467294692993
episode: [744. 717. 699. 741.]; total steps: 92000; episodes scores: [-98.66242314 -98.96041517 -98.08780934 -96.82957597]; avg score: -98.1350559053972
episode: [745. 733. 714. 757.]; total steps: 93000; episodes scores: [-97.42094544 -97.97685467 -98.93250384 -95.90394078]; avg score: -97.55856118058568
learning timestep: 94000
Policy Loss: -0.17499864101409912
Value Loss: 0.23841024935245514
Entropy: -1.0857057571411133
KL Divergence: 1.3609949350357056
episode: [752. 748. 730. 773.]; total steps: 94000; episodes scores: [-97.98918016 -96.67257915 -96.72122796 -98.25704691]; avg score: -97.41000854564645
Rendering episode 760.0 during training...
seed value:42
episode number sent to renderer:7

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_760.0.mp4
episode rendered
Episode 1/1 - Score: [-97.02259817]
episode: [767. 762. 745. 789.]; total steps: 95000; episodes scores: [-96.75902391 -97.71812874 -99.11150014 -98.90132342]; avg score: -98.1224940507474
Rendering episode 780.0 during training...
seed value:42
episode number sent to renderer:780.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_780.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_780.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_780.0.mp4
episode rendered
Episode 1/1 - Score: [-97.02259817]
learning timestep: 96000
Policy Loss: -0.17875491082668304
Value Loss: 0.3529725968837738
Entropy: -0.9158082008361816
KL Divergence: 1.271852731704712
episode: [781. 778. 760. 805.]; total steps: 96000; episodes scores: [-94.99932726 -97.53854706 -96.4992644  -97.30347139]; avg score: -96.58515252662123
episode: [797. 793. 775. 821.]; total steps: 97000; episodes scores: [-98.96026299 -98.03642701 -96.74400846 -97.97352761]; avg score: -97.92855651689135
Rendering episode 800.0 during training...
seed value:42
episode number sent to renderer:800.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_800.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_800.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_800.0.mp4
episode rendered
Episode 1/1 - Score: [-96.75636189]
learning timestep: 98000
Policy Loss: 0.16041429340839386
Value Loss: 0.4734228253364563
Entropy: -1.0061044692993164
KL Divergence: 1.517354965209961
episode: [812. 807. 791. 836.]; total steps: 98000; episodes scores: [-97.77020754 -97.64935439 -98.19261566 -99.34846034]; avg score: -98.24015948280987
Rendering episode 820.0 during training...
seed value:42
episode number sent to renderer:820.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_820.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_820.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_820.0.mp4
episode rendered
Episode 1/1 - Score: [-97.71529719]
episode: [827. 823. 805. 850.]; total steps: 99000; episodes scores: [-99.12790338 -98.05283845 -97.80919815 -97.11608967]; avg score: -98.02650741089539
learning timestep: 100000
Policy Loss: -0.08612256497144699
Value Loss: 0.2965472340583801
Entropy: -1.0740771293640137
KL Divergence: 1.5980827808380127
episode: [836. 839. 818. 865.]; total steps: 100000; episodes scores: [-97.47220107 -97.17829452 -96.5037172  -95.96091644]; avg score: -96.7787823063261
episode: [836. 853. 818. 881.]; total steps: 101000; episodes scores: [-97.47220107 -97.49332371 -96.5037172  -94.98340043]; avg score: -96.6131606005809
Rendering episode 840.0 during training...
seed value:42
episode number sent to renderer:840.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_840.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_840.0.mp4
episode rendered
Episode 1/1 - Score: [-97.06352688]
learning timestep: 102000
Policy Loss: 0.17194603383541107
Value Loss: 0.172672837972641
Entropy: -1.1125617027282715
KL Divergence: 1.7180230617523193
episode: [850. 867. 827. 895.]; total steps: 102000; episodes scores: [-98.71638283 -96.62951839 -97.73060545 -94.28093812]; avg score: -96.83936119750467
episode: [850. 883. 842. 911.]; total steps: 103000; episodes scores: [-98.71638283 -98.6725789  -99.23933609 -98.54415783]; avg score: -98.79311391222771
learning timestep: 104000
Policy Loss: 0.1619456559419632
Value Loss: 0.7394609451293945
Entropy: -1.1859396696090698
KL Divergence: 1.5947152376174927
episode: [857. 899. 858. 926.]; total steps: 104000; episodes scores: [-98.54115938 -94.0388478  -94.89577168 -96.013755  ]; avg score: -95.87238346211177
Rendering episode 860.0 during training...
seed value:42
episode number sent to renderer:

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_860.0.mp4
episode rendered
Episode 1/1 - Score: [-97.29424024]
episode: [871. 914. 872. 931.]; total steps: 105000; episodes scores: [-95.47931441 -96.7568872  -95.99633431 -95.77656093]; avg score: -96.00227421228416
Rendering episode 880.0 during training...
seed value:42
episode number sent to renderer:880.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_880.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_880.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_880.0.mp4
episode rendered
Episode 1/1 - Score: [-97.29424024]




learning timestep: 106000
Policy Loss: -0.02312801405787468
Value Loss: 0.6654974222183228
Entropy: -1.1630983352661133
KL Divergence: 1.599853277206421
episode: [886. 928. 887. 933.]; total steps: 106000; episodes scores: [-96.37750435 -94.83610672 -97.96179358 -94.6927102 ]; avg score: -95.96702871046682
Rendering episode 900.0 during training...
seed value:42
episode number sent to renderer:900.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_900.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_900.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_900.0.mp4
episode rendered
Episode 1/1 - Score: [-97.01067005]
episode: [900. 944. 902. 948.]; total steps: 107000; episodes scores: [-98.42481794 -97.46717736 -97.35090415 -95.06731032]; avg score: -97.07755244382193
learning timestep: 108000
Policy Loss: -0.1184290200471878
Value Loss: 0.3581853210926056
Entropy: -1.0353894233703613
KL Divergence: 1.4346379041671753
episode: [914. 959. 917. 963.]; total steps: 108000; episodes scores: [-94.21083056 -97.91259337 -97.05750417 -96.03528397]; avg score: -96.30405301636644
Rendering episode 920.0 during training...
seed value:42
episode number sent to renderer:920.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_920.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_920.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_920.0.mp4
episode rendered
Episode 1/1 - Score: [-97.35375502]




episode: [928. 973. 931. 978.]; total steps: 109000; episodes scores: [-97.76666262 -99.05320992 -96.93258863 -94.28640901]; avg score: -97.00971754448906
Rendering episode 940.0 during training...
seed value:42
episode number sent to renderer:940.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_940.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_940.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_940.0.mp4
episode rendered
Episode 1/1 - Score: [-97.35375502]
learning timestep: 110000
Policy Loss: -0.2190115749835968
Value Loss: 0.36345791816711426
Entropy: -1.1858625411987305
KL Divergence: 1.551995038986206
episode: [942. 989. 946. 988.]; total steps: 110000; episodes scores: [ -97.2186777  -100.21984788  -97.52433563  -97.19900391]; avg score: -98.04046627725894
episode: [ 947. 1002.  961.  988.]; total steps: 111000; episodes scores: [-95.3508882  -97.78399762 -97.17132211 -97.19900391]; avg score: -96.87630296038309
learning timestep: 112000
Policy Loss: 0.01311310101300478
Value Loss: 0.10841189324855804
Entropy: -1.4512474536895752
KL Divergence: 1.9289895296096802
episode: [ 949. 1007.  964. 1000.]; total steps: 112000; episodes scores: [-96.72793354 -96.97927021 -96.08963248 -97.02860346]; avg score: -96.70635992242643
Rendering episode 960.0 during training...
seed value:42
episode number se

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_960.0.mp4
episode rendered
Episode 1/1 - Score: [-96.28652228]
episode: [ 962. 1009.  967. 1015.]; total steps: 113000; episodes scores: [-98.40639216 -94.85127379 -98.27446873 -93.47498467]; avg score: -96.25177983391657
learning timestep: 114000
Policy Loss: -0.0867079347372055
Value Loss: 0.34616321325302124
Entropy: -1.3435544967651367
KL Divergence: 2.2830801010131836
episode: [ 976. 1022.  982. 1029.]; total steps: 114000; episodes scores: [-96.43477702 -96.38208644 -97.3691728  -97.55399887]; avg score: -96.9350087829566
Rendering episode 980.0 during training...
seed value:42
episode number sent to renderer:980.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_980.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_980.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_980.0.mp4
episode rendered
Episode 1/1 - Score: [-94.58862161]
episode: [ 991. 1035.  985. 1042.]; total steps: 115000; episodes scores: [-97.07015528 -96.9360153  -95.29151832 -98.50883613]; avg score: -96.95163125509366
Rendering episode 1000.0 during training...
seed value:42
episode number sent to renderer:1000.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1000.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1000.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1000.0.mp4
episode rendered
Episode 1/1 - Score: [-94.58862161]
learning timestep: 116000
Policy Loss: 0.2140197604894638
Value Loss: 0.17856121063232422
Entropy: -1.3920485973358154
KL Divergence: 1.9094419479370117
episode: [1004. 1050.  989. 1056.]; total steps: 116000; episodes scores: [-92.4983825  -97.61385798 -96.55578669 -95.47868026]; avg score: -95.53667685762669
episode: [1014. 1062. 1001. 1069.]; total steps: 117000; episodes scores: [-95.82153124 -95.55959607 -98.01742165 -94.4709305 ]; avg score: -95.96736986195452
learning timestep: 118000
Policy Loss: 0.030512845143675804
Value Loss: 0.2907506823539734
Entropy: -1.4801528453826904
KL Divergence: 1.9766616821289062
episode: [1014. 1065. 1013. 1083.]; total steps: 118000; episodes scores: [-95.82153124 -93.94472858 -96.06149916 -96.55599186]; avg score: -95.5959377086819
episode: [1015. 1068. 1023. 1089.]; total steps: 119000; episodes scores: 

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1020.0.mp4
episode rendered
Episode 1/1 - Score: [-94.6844553]
learning timestep: 122000
Policy Loss: 0.07361957430839539
Value Loss: 0.16343732178211212
Entropy: -1.7461590766906738
KL Divergence: 1.7201364040374756
episode: [1022. 1086. 1046. 1107.]; total steps: 122000; episodes scores: [-95.84547475 -94.81621664 -95.25228474 -93.95823253]; avg score: -94.96805216300797
episode: [1034. 1097. 1052. 1107.]; total steps: 123000; episodes scores: [-95.39801567 -96.28052403 -93.27344114 -93.95823253]; avg score: -94.72755334295286
Rendering episode 1040.0 during training...
seed value:42
episode number sent to renderer:1040.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1040.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1040.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1040.0.mp4
episode rendered
Episode 1/1 - Score: [-93.69854371]
learning timestep: 124000
Policy Loss: -0.3329220712184906
Value Loss: 0.6264945268630981
Entropy: -1.4962297677993774
KL Divergence: 2.293818950653076
episode: [1047. 1110. 1065. 1117.]; total steps: 124000; episodes scores: [-95.35191413 -94.28450839 -96.79641542 -96.34598343]; avg score: -95.69470534296039
episode: [1059. 1121. 1077. 1128.]; total steps: 125000; episodes scores: [-93.69985701 -92.91077936 -95.10642803 -93.83697016]; avg score: -93.88850864253301
Rendering episode 1060.0 during training...
seed value:42
episode number sent to renderer:1060.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1060.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1060.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1060.0.mp4
episode rendered
Episode 1/1 - Score: [-94.80534712]
learning timestep: 126000
Policy Loss: 0.15901975333690643
Value Loss: 0.2523970603942871
Entropy: -1.5399198532104492
KL Divergence: 2.1139450073242188
episode: [1070. 1133. 1088. 1131.]; total steps: 126000; episodes scores: [-93.03747482 -94.77578521 -92.00374633 -93.5336013 ]; avg score: -93.33765191417545
episode: [1079. 1145. 1101. 1134.]; total steps: 127000; episodes scores: [-94.10654348 -93.20603946 -93.79410114 -91.19137372]; avg score: -93.07451445071322
learning timestep: 128000
Policy Loss: -0.03643391653895378
Value Loss: 0.24776291847229004
Entropy: -1.627225399017334
KL Divergence: 2.99267578125
episode: [1079. 1155. 1113. 1140.]; total steps: 128000; episodes scores: [-94.10654348 -93.36856332 -96.09627165 -93.3369057 ]; avg score: -94.22707103903375
Rendering episode 1080.0 during training...
seed value:42
episode number sent 

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1080.0.mp4
episode rendered
Episode 1/1 - Score: [-94.88836749]
episode: [1087. 1168. 1125. 1150.]; total steps: 129000; episodes scores: [-93.42144353 -95.63769555 -94.37671085 -95.64167227]; avg score: -94.76938054897622
Rendering episode 1100.0 during training...
seed value:42
episode number sent to renderer:1100.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1100.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1100.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1100.0.mp4
episode rendered
Episode 1/1 - Score: [-94.88836749]
learning timestep: 130000
Policy Loss: 0.06421556323766708
Value Loss: 0.41494572162628174
Entropy: -1.4316753149032593
KL Divergence: 2.1385042667388916
episode: [1100. 1180. 1137. 1162.]; total steps: 130000; episodes scores: [-94.11364777 -95.58922401 -94.03951685 -91.9401526 ]; avg score: -93.92063530940996
episode: [1110. 1191. 1149. 1174.]; total steps: 131000; episodes scores: [-91.06826333 -93.65969898 -91.4772201  -92.89795936]; avg score: -92.2757854436268
Rendering episode 1120.0 during training...
seed value:42
episode number sent to renderer:1120.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1120.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1120.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1120.0.mp4
episode rendered
Episode 1/1 - Score: [-93.88654653]
learning timestep: 132000
Policy Loss: 0.015653979033231735
Value Loss: 0.19060328602790833
Entropy: -1.423702359199524
KL Divergence: 1.9246312379837036
episode: [1120. 1202. 1161. 1185.]; total steps: 132000; episodes scores: [-93.28650912 -93.73817301 -95.20852908 -92.9724539 ]; avg score: -93.8014162780464
episode: [1132. 1213. 1172. 1197.]; total steps: 133000; episodes scores: [-93.60761422 -92.91395394 -93.92257986 -93.09064136]; avg score: -93.3836973442994
Rendering episode 1140.0 during training...
seed value:42
episode number sent to renderer:1140.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1140.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1140.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1140.0.mp4
episode rendered
Episode 1/1 - Score: [-93.31497049]
learning timestep: 134000
Policy Loss: -0.030061252415180206
Value Loss: 0.14007999002933502
Entropy: -1.5322859287261963
KL Divergence: 2.0939435958862305
episode: [1142. 1225. 1184. 1207.]; total steps: 134000; episodes scores: [-91.5213791  -92.40602564 -94.23052984 -90.71599478]; avg score: -92.21848233814212
episode: [1153. 1235. 1194. 1218.]; total steps: 135000; episodes scores: [-94.88091727 -93.49337364 -92.48808679 -91.76207285]; avg score: -93.15611263594714
Rendering episode 1160.0 during training...
seed value:42
episode number sent to renderer:1160.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1160.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1160.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1160.0.mp4
episode rendered
Episode 1/1 - Score: [-92.65317173]
learning timestep: 136000
Policy Loss: -0.056585393846035004
Value Loss: 0.16217274963855743
Entropy: -1.5552510023117065
KL Divergence: 2.547184705734253
episode: [1164. 1245. 1205. 1229.]; total steps: 136000; episodes scores: [-89.75244859 -91.928014   -93.70821027 -92.01491912]; avg score: -91.85089799439768
episode: [1175. 1257. 1216. 1240.]; total steps: 137000; episodes scores: [-92.91489379 -94.23035989 -91.72011057 -93.23046084]; avg score: -93.0239562704875
Rendering episode 1180.0 during training...
seed value:42
episode number sent to renderer:1180.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1180.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1180.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1180.0.mp4
episode rendered
Episode 1/1 - Score: [-92.41618193]




learning timestep: 138000
Policy Loss: 0.1240260973572731
Value Loss: 0.21507367491722107
Entropy: -1.572155475616455
KL Divergence: 2.2504916191101074
episode: [1185. 1267. 1227. 1251.]; total steps: 138000; episodes scores: [-90.70277646 -92.03960457 -93.21957446 -93.19397051]; avg score: -92.28898149996205
episode: [1195. 1278. 1239. 1263.]; total steps: 139000; episodes scores: [-91.62249244 -92.43501071 -91.15334061 -92.84428357]; avg score: -92.01378183353646
Rendering episode 1200.0 during training...
seed value:42
episode number sent to renderer:1200.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1200.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1200.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1200.0.mp4
episode rendered
Episode 1/1 - Score: [-92.7459626]
learning timestep: 140000
Policy Loss: -0.3118324875831604
Value Loss: 0.43141651153564453
Entropy: -1.5887150764465332
KL Divergence: 2.4194188117980957
episode: [1205. 1289. 1250. 1274.]; total steps: 140000; episodes scores: [-90.88891468 -91.25225299 -92.4360417  -94.05253925]; avg score: -92.15743715671326
episode: [1216. 1300. 1260. 1285.]; total steps: 141000; episodes scores: [-93.52935924 -92.13175587 -91.45956395 -94.12316623]; avg score: -92.81096132391488
Rendering episode 1220.0 during training...
seed value:42
episode number sent to renderer:1220.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1220.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1220.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1220.0.mp4
episode rendered
Episode 1/1 - Score: [-92.81503494]
learning timestep: 142000
Policy Loss: -0.03382577747106552
Value Loss: 0.11200544238090515
Entropy: -1.7262473106384277
KL Divergence: 2.8967223167419434
episode: [1226. 1311. 1271. 1295.]; total steps: 142000; episodes scores: [-92.05998414 -93.11105847 -88.25780781 -92.4306924 ]; avg score: -91.46488570485312
episode: [1236. 1322. 1282. 1306.]; total steps: 143000; episodes scores: [-89.28152979 -88.28788974 -90.92222378 -92.16317467]; avg score: -90.16370449621867
Rendering episode 1240.0 during training...
seed value:42
episode number sent to renderer:1240.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1240.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1240.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1240.0.mp4
episode rendered
Episode 1/1 - Score: [-92.5523784]
learning timestep: 144000
Policy Loss: -0.2537160813808441
Value Loss: 0.37213242053985596
Entropy: -1.6678167581558228
KL Divergence: 2.1280927658081055
episode: [1245. 1333. 1293. 1316.]; total steps: 144000; episodes scores: [-91.86959528 -91.68723424 -90.37283791 -91.91249879]; avg score: -91.46054155235723
episode: [1257. 1344. 1303. 1326.]; total steps: 145000; episodes scores: [-91.22232046 -92.29951459 -92.13860714 -88.5408389 ]; avg score: -91.05032027278808
Rendering episode 1260.0 during training...
seed value:42
episode number sent to renderer:1260.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1260.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1260.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1260.0.mp4
episode rendered
Episode 1/1 - Score: [-92.33869069]
learning timestep: 146000
Policy Loss: -0.03763780742883682
Value Loss: 0.169778972864151
Entropy: -1.8646137714385986
KL Divergence: 2.7821061611175537
episode: [1265. 1354. 1314. 1336.]; total steps: 146000; episodes scores: [-91.32867695 -91.54707523 -93.30975635 -92.53081058]; avg score: -92.17907977697439
episode: [1275. 1365. 1323. 1346.]; total steps: 147000; episodes scores: [-91.50389714 -93.71577538 -94.41465608 -91.05649504]; avg score: -92.67270591296271
Rendering episode 1280.0 during training...
seed value:42
episode number sent to renderer:1280.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1280.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1280.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1280.0.mp4
episode rendered
Episode 1/1 - Score: [-91.34696706]
learning timestep: 148000
Policy Loss: 0.05304597318172455
Value Loss: 0.2395435869693756
Entropy: -1.7148832082748413
KL Divergence: 2.808030128479004
episode: [1282. 1375. 1333. 1356.]; total steps: 148000; episodes scores: [-91.23314876 -89.09753401 -90.4037636  -92.18090587]; avg score: -90.7288380585614
episode: [1292. 1385. 1344. 1366.]; total steps: 149000; episodes scores: [-89.24175045 -89.39487276 -91.71824087 -91.66813302]; avg score: -90.50574927590895
Rendering episode 1300.0 during training...
seed value:42
episode number sent to renderer:1300.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1300.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1300.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1300.0.mp4
episode rendered
Episode 1/1 - Score: [-91.3269419]
learning timestep: 150000
Policy Loss: -0.27438610792160034
Value Loss: 0.27598756551742554
Entropy: -1.8507670164108276
KL Divergence: 2.187688112258911
episode: [1301. 1394. 1354. 1375.]; total steps: 150000; episodes scores: [-92.59482662 -90.94777094 -91.34649413 -90.76681957]; avg score: -91.41397781470138
episode: [1310. 1403. 1364. 1386.]; total steps: 151000; episodes scores: [-91.56586794 -92.70474667 -89.02582782 -92.3367623 ]; avg score: -91.40830118221572
learning timestep: 152000
Policy Loss: -0.022658338770270348
Value Loss: 0.23622839152812958
Entropy: -1.9777735471725464
KL Divergence: 2.7619993686676025
episode: [1319. 1413. 1373. 1395.]; total steps: 152000; episodes scores: [-89.57041767 -89.49620832 -93.83517433 -90.24418008]; avg score: -90.78649510307439
Rendering episode 1320.0 during training...
seed value:42
episode numbe

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1320.0.mp4
episode rendered
Episode 1/1 - Score: [-90.31173432]
episode: [1328. 1423. 1384. 1404.]; total steps: 153000; episodes scores: [-90.22701309 -89.59275838 -91.21229293 -91.17183322]; avg score: -90.5509744020022
learning timestep: 154000
Policy Loss: 0.3003113269805908
Value Loss: 0.1816623955965042
Entropy: -1.7947335243225098
KL Divergence: 2.4261655807495117
episode: [1336. 1433. 1393. 1414.]; total steps: 154000; episodes scores: [-90.93440184 -89.56013626 -90.90365195 -93.4022985 ]; avg score: -91.20012213731789
Rendering episode 1340.0 during training...
seed value:42
episode number sent to renderer:1340.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1340.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1340.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1340.0.mp4
episode rendered
Episode 1/1 - Score: [-91.66966802]
episode: [1345. 1442. 1403. 1423.]; total steps: 155000; episodes scores: [-89.368415   -90.94103079 -91.73522431 -92.23574144]; avg score: -91.07010288843125
learning timestep: 156000
Policy Loss: 0.16742554306983948
Value Loss: 0.23948554694652557
Entropy: -2.0612549781799316
KL Divergence: 2.8421034812927246
episode: [1355. 1449. 1413. 1433.]; total steps: 156000; episodes scores: [-91.18494743 -91.61651786 -93.42747067 -91.04546279]; avg score: -91.81859968785166
Rendering episode 1360.0 during training...
seed value:42
episode number sent to renderer:1360.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1360.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1360.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1360.0.mp4
episode rendered
Episode 1/1 - Score: [-89.61282807]
episode: [1363. 1459. 1422. 1442.]; total steps: 157000; episodes scores: [-89.94539321 -89.81956851 -93.26731322 -90.24257473]; avg score: -90.81871241891042
learning timestep: 158000
Policy Loss: 0.03535019978880882
Value Loss: 0.14957338571548462
Entropy: -1.907745361328125
KL Divergence: 3.4149246215820312
episode: [1373. 1468. 1433. 1452.]; total steps: 158000; episodes scores: [-91.16602617 -88.96740755 -88.95414258 -91.32644791]; avg score: -90.10350605323072
Rendering episode 1380.0 during training...
seed value:42
episode number sent to renderer:1380.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1380.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1380.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1380.0.mp4
episode rendered
Episode 1/1 - Score: [-90.54348401]
episode: [1382. 1477. 1443. 1461.]; total steps: 159000; episodes scores: [-92.81058313 -90.89607456 -92.12885449 -87.49507339]; avg score: -90.83264639304531
learning timestep: 160000
Policy Loss: -0.23280216753482819
Value Loss: 0.4425647556781769
Entropy: -1.8387885093688965
KL Divergence: 2.6568286418914795
episode: [1391. 1487. 1453. 1471.]; total steps: 160000; episodes scores: [-92.69298627 -93.21904916 -93.19845296 -90.51939614]; avg score: -92.40747113164483
Rendering episode 1400.0 during training...
seed value:42
episode number sent to renderer:1400.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1400.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1400.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1400.0.mp4
episode rendered
Episode 1/1 - Score: [-89.63368643]
episode: [1400. 1497. 1462. 1480.]; total steps: 161000; episodes scores: [-91.43719777 -89.18642514 -89.00246016 -91.33957634]; avg score: -90.24141485459218
learning timestep: 162000
Policy Loss: 0.03060150519013405
Value Loss: 0.2219718098640442
Entropy: -2.0926895141601562
KL Divergence: 3.4589056968688965
episode: [1410. 1505. 1471. 1489.]; total steps: 162000; episodes scores: [-91.61832629 -90.4042949  -89.88815977 -90.44087567]; avg score: -90.58791415917051
Rendering episode 1420.0 during training...
seed value:42
episode number sent to renderer:1420.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1420.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1420.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1420.0.mp4
episode rendered
Episode 1/1 - Score: [-89.76404931]
episode: [1420. 1515. 1480. 1499.]; total steps: 163000; episodes scores: [-92.27335652 -90.55900029 -91.46983607 -89.56264568]; avg score: -90.96620963948857
learning timestep: 164000
Policy Loss: -0.06488284468650818
Value Loss: 0.22501638531684875
Entropy: -1.890417456626892
KL Divergence: 2.7573297023773193
episode: [1429. 1525. 1490. 1508.]; total steps: 164000; episodes scores: [-90.89773115 -91.45299962 -89.57204769 -91.36053987]; avg score: -90.82082958364379
episode: [1438. 1534. 1500. 1518.]; total steps: 165000; episodes scores: [-90.05253467 -91.2322458  -90.93839551 -94.61104998]; avg score: -91.70855649154866
Rendering episode 1440.0 during training...
seed value:42
episode number sent to renderer:1440.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1440.0.mp4.
Moviepy - Writing video Bipedal

                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1440.0.mp4
episode rendered
Episode 1/1 - Score: [-90.86405673]
learning timestep: 166000
Policy Loss: 0.0414319783449173
Value Loss: 0.2279585301876068
Entropy: -2.121845245361328
KL Divergence: 3.487071990966797
episode: [1446. 1544. 1510. 1527.]; total steps: 166000; episodes scores: [-89.58933829 -91.43905238 -89.8588781  -92.32675049]; avg score: -90.80350481587509
episode: [1456. 1553. 1520. 1536.]; total steps: 167000; episodes scores: [-90.53767597 -90.20126599 -89.2830839  -90.14385988]; avg score: -90.0414714370612
Rendering episode 1460.0 during training...
seed value:42
episode number sent to renderer:1460.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1460.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1460.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1460.0.mp4
episode rendered
Episode 1/1 - Score: [-91.79702539]
learning timestep: 168000
Policy Loss: 0.15155787765979767
Value Loss: 0.11093145608901978
Entropy: -1.9067491292953491
KL Divergence: 3.1610469818115234
episode: [1465. 1562. 1529. 1546.]; total steps: 168000; episodes scores: [-91.07851082 -90.99389479 -88.61756753 -91.93655423]; avg score: -90.65663184211016
episode: [1473. 1571. 1538. 1554.]; total steps: 169000; episodes scores: [-92.29256179 -87.57357061 -90.37420019 -89.22779081]; avg score: -89.86703084753779
Rendering episode 1480.0 during training...
seed value:42
episode number sent to renderer:1480.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1480.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1480.0.mp4



                                                             

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1480.0.mp4
episode rendered
Episode 1/1 - Score: [-91.52701943]
learning timestep: 170000
Policy Loss: -0.036059606820344925
Value Loss: 0.13244962692260742
Entropy: -2.1215643882751465
KL Divergence: 2.9384350776672363
episode: [1482. 1580. 1548. 1564.]; total steps: 170000; episodes scores: [-90.90280425 -91.12280514 -91.13242366 -90.07228317]; avg score: -90.80757905588315
episode: [1491. 1588. 1557. 1573.]; total steps: 171000; episodes scores: [-92.63976319 -88.86594732 -90.14955016 -89.82456862]; avg score: -90.36995732297655
Rendering episode 1500.0 during training...
seed value:42
episode number sent to renderer:1500.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1500.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1500.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1500.0.mp4
episode rendered
Episode 1/1 - Score: [-91.18901482]
learning timestep: 172000
Policy Loss: 0.08800585567951202
Value Loss: 0.0915999710559845
Entropy: -2.0378565788269043
KL Divergence: 3.378136157989502
episode: [1500. 1598. 1566. 1582.]; total steps: 172000; episodes scores: [-90.64324273 -92.08875914 -88.58035463 -89.97711241]; avg score: -90.32236722751473
episode: [1509. 1607. 1575. 1591.]; total steps: 173000; episodes scores: [-89.60004474 -88.7228923  -91.66689028 -90.33155714]; avg score: -90.08034611635219
learning timestep: 174000
Policy Loss: -0.26254624128341675
Value Loss: 0.09844858944416046
Entropy: -2.1851017475128174
KL Divergence: 3.6747846603393555
episode: [1518. 1615. 1584. 1599.]; total steps: 174000; episodes scores: [-88.55234632 -90.48698542 -89.64053329 -90.92225857]; avg score: -89.90053089866181
Rendering episode 1520.0 during training...
seed value:42
episode number 

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1520.0.mp4
episode rendered
Episode 1/1 - Score: [-90.67457237]
episode: [1527. 1624. 1593. 1608.]; total steps: 175000; episodes scores: [-90.17320048 -91.33963626 -88.7153461  -89.7454066 ]; avg score: -89.99339735713056
learning timestep: 176000
Policy Loss: 0.13506032526493073
Value Loss: 0.13710734248161316
Entropy: -2.082366466522217
KL Divergence: 3.533259868621826
episode: [1536. 1632. 1601. 1617.]; total steps: 176000; episodes scores: [-91.64550939 -89.12082707 -86.9376235  -88.86193018]; avg score: -89.1414725341297
Rendering episode 1540.0 during training...
seed value:42
episode number sent to renderer:1540.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1540.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1540.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1540.0.mp4
episode rendered
Episode 1/1 - Score: [-90.17780862]
episode: [1544. 1641. 1610. 1625.]; total steps: 177000; episodes scores: [-91.57441806 -90.32177268 -88.04056221 -89.76548608]; avg score: -89.92555976020995
learning timestep: 178000
Policy Loss: -0.14530067145824432
Value Loss: 0.09285685420036316
Entropy: -2.2246155738830566
KL Divergence: 2.441056728363037
episode: [1552. 1649. 1618. 1633.]; total steps: 178000; episodes scores: [-90.20783791 -93.24992245 -91.70763008 -89.45202874]; avg score: -91.1543547954227
Rendering episode 1560.0 during training...
seed value:42
episode number sent to renderer:1560.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1560.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1560.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1560.0.mp4
episode rendered
Episode 1/1 - Score: [-89.04800797]
episode: [1561. 1658. 1627. 1642.]; total steps: 179000; episodes scores: [-90.21691794 -90.95539738 -89.03174858 -88.25055353]; avg score: -89.61365435761198
learning timestep: 180000
Policy Loss: 0.019546395167708397
Value Loss: 0.0843757763504982
Entropy: -2.237999439239502
KL Divergence: 3.3966946601867676
episode: [1569. 1666. 1635. 1650.]; total steps: 180000; episodes scores: [-87.67428339 -88.9180246  -88.86139563 -88.72510088]; avg score: -88.54470112720132
episode: [1578. 1674. 1644. 1659.]; total steps: 181000; episodes scores: [-90.25643545 -87.96290238 -91.2232186  -88.41739836]; avg score: -89.46498869584128
Rendering episode 1580.0 during training...
seed value:42
episode number sent to renderer:1580.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1580.0.mp4.
Moviepy - Writing video BipedalW

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1580.0.mp4
episode rendered
Episode 1/1 - Score: [-88.75631122]
learning timestep: 182000
Policy Loss: 0.17668266594409943
Value Loss: 0.10748858749866486
Entropy: -2.278834819793701
KL Divergence: 3.0338006019592285
episode: [1586. 1683. 1652. 1666.]; total steps: 182000; episodes scores: [-90.27324437 -87.33655731 -89.96355383 -86.81711236]; avg score: -88.59761696753776
episode: [1595. 1692. 1660. 1674.]; total steps: 183000; episodes scores: [-88.98469233 -89.99507461 -89.75870107 -89.77534833]; avg score: -89.62845408313876
Rendering episode 1600.0 during training...
seed value:42
episode number sent to renderer:1600.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1600.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1600.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1600.0.mp4
episode rendered
Episode 1/1 - Score: [-87.85328046]
learning timestep: 184000
Policy Loss: 0.009596526622772217
Value Loss: 0.08102262020111084
Entropy: -2.2020864486694336
KL Divergence: 4.499589920043945
episode: [1603. 1700. 1669. 1683.]; total steps: 184000; episodes scores: [-88.65486309 -87.23505323 -92.98064568 -89.12702575]; avg score: -89.49939693734298
episode: [1611. 1708. 1677. 1691.]; total steps: 185000; episodes scores: [-87.47542593 -89.50048838 -91.57660602 -88.67504326]; avg score: -89.30689089670327
learning timestep: 186000
Policy Loss: 0.05630632862448692
Value Loss: 0.08884970098733902
Entropy: -2.266240358352661
KL Divergence: 4.123047828674316
episode: [1619. 1717. 1686. 1699.]; total steps: 186000; episodes scores: [-89.34836785 -88.74514352 -89.79679759 -89.93114772]; avg score: -89.45536416899769
Rendering episode 1620.0 during training...
seed value:42
episode number s

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1620.0.mp4
episode rendered
Episode 1/1 - Score: [-87.89474994]
episode: [1627. 1725. 1694. 1707.]; total steps: 187000; episodes scores: [-88.53953643 -88.21625138 -86.99892605 -87.56900162]; avg score: -87.83092887001997
learning timestep: 188000
Policy Loss: -0.15220674872398376
Value Loss: 0.14867660403251648
Entropy: -2.3031814098358154
KL Divergence: 3.2404513359069824
episode: [1635. 1731. 1702. 1715.]; total steps: 188000; episodes scores: [-87.2547803  -91.6172639  -85.87057821 -93.16559137]; avg score: -89.4770534431017
Rendering episode 1640.0 during training...
seed value:42
episode number sent to renderer:1640.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1640.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1640.0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1640.0.mp4
episode rendered
Episode 1/1 - Score: [-87.41239122]
episode: [1643. 1738. 1710. 1723.]; total steps: 189000; episodes scores: [-87.19959523 -89.75347084 -87.90269066 -88.25802601]; avg score: -88.2784456835414
learning timestep: 190000
Policy Loss: 0.22971466183662415
Value Loss: 0.11737552285194397
Entropy: -2.1295459270477295
KL Divergence: 4.142721176147461
episode: [1651. 1746. 1718. 1731.]; total steps: 190000; episodes scores: [-89.61961814 -89.15369826 -88.76517901 -88.93183912]; avg score: -89.11758363333222
episode: [1659. 1754. 1726. 1739.]; total steps: 191000; episodes scores: [-88.72852686 -89.22169266 -88.63323256 -87.92972499]; avg score: -88.62829426910821
Rendering episode 1660.0 during training...
seed value:42
episode number sent to renderer:1660.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1660.0.mp4.
Moviepy - Writing video BipedalWa

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1660.0.mp4
episode rendered
Episode 1/1 - Score: [-86.80277838]
learning timestep: 192000
Policy Loss: 0.05885729938745499
Value Loss: 0.11371715366840363
Entropy: -2.4399046897888184
KL Divergence: 3.4955360889434814
episode: [1666. 1762. 1735. 1746.]; total steps: 192000; episodes scores: [-89.1224079  -87.10128889 -89.49710952 -86.77547507]; avg score: -88.12407034580471
episode: [1674. 1771. 1743. 1754.]; total steps: 193000; episodes scores: [-89.14081295 -87.7865151  -88.06802213 -87.74412919]; avg score: -88.18486984121986
Rendering episode 1680.0 during training...
seed value:42
episode number sent to renderer:1680.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1680.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1680.0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1680.0.mp4
episode rendered
Episode 1/1 - Score: [-87.79881914]
learning timestep: 194000
Policy Loss: -0.008327854797244072
Value Loss: 0.13979870080947876
Entropy: -2.2659687995910645
KL Divergence: 4.105343818664551
episode: [1683. 1779. 1751. 1762.]; total steps: 194000; episodes scores: [-88.51880761 -88.93636743 -87.73334902 -88.59556437]; avg score: -88.44602210929493
episode: [1691. 1787. 1759. 1771.]; total steps: 195000; episodes scores: [-87.4166066  -87.63266921 -87.07370231 -86.97467044]; avg score: -87.27441213977353
learning timestep: 196000
Policy Loss: -0.04028218239545822
Value Loss: 0.15257716178894043
Entropy: -2.229966640472412
KL Divergence: 3.052595376968384
episode: [1699. 1796. 1767. 1779.]; total steps: 196000; episodes scores: [-87.11934751 -88.32866142 -89.70131055 -88.49182541]; avg score: -88.41028622139866
Rendering episode 1700.0 during training...
seed value:42
episode number

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1700.0.mp4
episode rendered
Episode 1/1 - Score: [-87.67861218]
episode: [1708. 1804. 1775. 1788.]; total steps: 197000; episodes scores: [-85.62297999 -86.21826671 -88.3821147  -86.87849711]; avg score: -86.77546462638426
learning timestep: 198000
Policy Loss: -0.18723970651626587
Value Loss: 0.1606147289276123
Entropy: -2.3773655891418457
KL Divergence: 5.68685245513916
episode: [1716. 1812. 1784. 1796.]; total steps: 198000; episodes scores: [-86.65083926 -86.76649932 -88.32613058 -86.6433942 ]; avg score: -87.09671583968526
Rendering episode 1720.0 during training...
seed value:42
episode number sent to renderer:1720.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1720.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1720.0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1720.0.mp4
episode rendered
Episode 1/1 - Score: [-88.13930921]
episode: [1724. 1821. 1792. 1804.]; total steps: 199000; episodes scores: [-87.3822614  -87.8962183  -84.9373918  -87.50273911]; avg score: -86.92965265258681
learning timestep: 200000
Policy Loss: 0.0205231960862875
Value Loss: 0.1384216845035553
Entropy: -2.3073086738586426
KL Divergence: 4.549289703369141
episode: [1733. 1829. 1801. 1813.]; total steps: 200000; episodes scores: [-88.61886295 -87.54934988 -90.12296363 -85.65760003]; avg score: -87.98719412460116
Rendering episode 1740.0 during training...
seed value:42
episode number sent to renderer:1740.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1740.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1740.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1740.0.mp4
episode rendered
Episode 1/1 - Score: [-87.46164608]
episode: [1741. 1836. 1809. 1821.]; total steps: 201000; episodes scores: [-87.98262842 -89.49882443 -88.60336853 -84.58468004]; avg score: -87.66737535457236
learning timestep: 202000
Policy Loss: 0.05515426769852638
Value Loss: 0.4950297474861145
Entropy: -2.2884037494659424
KL Divergence: 4.166984558105469
episode: [1750. 1844. 1817. 1830.]; total steps: 202000; episodes scores: [-84.98051055 -86.9986858  -87.70411157 -88.4405152 ]; avg score: -87.03095577748229
episode: [1759. 1852. 1825. 1838.]; total steps: 203000; episodes scores: [-88.11151386 -84.48696158 -87.43383413 -87.35333112]; avg score: -86.846410173115
Rendering episode 1760.0 during training...
seed value:42
episode number sent to renderer:1760.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1760.0.mp4.
Moviepy - Writing video BipedalWalk

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1760.0.mp4
episode rendered
Episode 1/1 - Score: [-87.78239764]
learning timestep: 204000
Policy Loss: -0.19492173194885254
Value Loss: 0.182217538356781
Entropy: -2.2769882678985596
KL Divergence: 3.7958154678344727
episode: [1767. 1859. 1833. 1845.]; total steps: 204000; episodes scores: [-87.32797195 -88.15315394 -87.64622443 -85.71910077]; avg score: -87.2116127714448
episode: [1775. 1867. 1839. 1854.]; total steps: 205000; episodes scores: [-86.72913847 -82.84131724 -85.61219147 -86.72691705]; avg score: -85.47739105841657
Rendering episode 1780.0 during training...
seed value:42
episode number sent to renderer:1780.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1780.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1780.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1780.0.mp4
episode rendered
Episode 1/1 - Score: [-87.83429251]
learning timestep: 206000
Policy Loss: 0.1588388979434967
Value Loss: 0.18245533108711243
Entropy: -2.5039358139038086
KL Divergence: 4.465455532073975
episode: [1783. 1875. 1847. 1861.]; total steps: 206000; episodes scores: [-86.9442521  -87.48902379 -85.69226696 -87.23328383]; avg score: -86.83970666782976
episode: [1792. 1883. 1855. 1869.]; total steps: 207000; episodes scores: [-85.44112372 -86.24697171 -85.59738901 -89.24797504]; avg score: -86.63336487092187
Rendering episode 1800.0 during training...
seed value:42
episode number sent to renderer:1800.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1800.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1800.0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1800.0.mp4
episode rendered
Episode 1/1 - Score: [-87.07643271]
learning timestep: 208000


[34m[1mwandb[0m: Adding directory to artifact (./BipedalWalker/ppo)... 

Policy Loss: -0.025104381144046783
Value Loss: 0.10381957143545151
Entropy: -2.4306392669677734
KL Divergence: 5.716612815856934


Done. 0.2s


episode: [1800. 1891. 1863. 1877.]; total steps: 208000; episodes scores: [-84.72897337 -86.68076488 -85.27816165 -86.8916765 ]; avg score: -85.89489409919238
episode: [1809. 1899. 1872. 1885.]; total steps: 209000; episodes scores: [-86.51005155 -86.85378671 -88.06490059 -86.31853108]; avg score: -86.93681748308241
learning timestep: 210000
Policy Loss: 0.09919518232345581
Value Loss: 0.08014434576034546
Entropy: -2.433913230895996
KL Divergence: 4.004668235778809
episode: [1817. 1908. 1881. 1894.]; total steps: 210000; episodes scores: [-87.96382054 -85.99354514 -88.70649939 -85.60682282]; avg score: -87.06767197238499
Rendering episode 1820.0 during training...
seed value:42
episode number sent to renderer:1820.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1820.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1820.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1820.0.mp4
episode rendered
Episode 1/1 - Score: [-87.62428155]
episode: [1825. 1916. 1889. 1902.]; total steps: 211000; episodes scores: [-84.45529367 -85.06321066 -88.28270759 -85.68187398]; avg score: -85.87077147675025
learning timestep: 212000
Policy Loss: -0.04980874061584473
Value Loss: 0.1931457221508026
Entropy: -2.4598402976989746
KL Divergence: 4.146085739135742
episode: [1833. 1924. 1896. 1911.]; total steps: 212000; episodes scores: [-86.83463211 -85.09237709 -89.84502622 -85.59827684]; avg score: -86.8425780638388
Rendering episode 1840.0 during training...
seed value:42
episode number sent to renderer:1840.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1840.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1840.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1840.0.mp4
episode rendered
Episode 1/1 - Score: [-87.20340913]
episode: [1842. 1932. 1905. 1919.]; total steps: 213000; episodes scores: [-86.92388502 -84.46269716 -85.089787   -86.75600365]; avg score: -85.80809320913022
learning timestep: 214000
Policy Loss: -0.06661473214626312
Value Loss: 0.11232130229473114
Entropy: -2.335789918899536
KL Divergence: 5.581724166870117
episode: [1851. 1940. 1913. 1927.]; total steps: 214000; episodes scores: [-85.6760889  -85.50928809 -87.44191719 -85.32904794]; avg score: -85.98908552998257
episode: [1859. 1948. 1921. 1934.]; total steps: 215000; episodes scores: [-84.02512251 -85.30956812 -86.18248702 -85.46535491]; avg score: -85.24563313989543
Rendering episode 1860.0 during training...
seed value:42
episode number sent to renderer:1860.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1860.0.mp4.
Moviepy - Writing video BipedalW

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1860.0.mp4
episode rendered
Episode 1/1 - Score: [-86.45368046]
learning timestep: 216000
Policy Loss: -0.010346447117626667
Value Loss: 0.10071081668138504
Entropy: -2.2984766960144043
KL Divergence: 5.641991138458252
episode: [1867. 1956. 1929. 1943.]; total steps: 216000; episodes scores: [-84.98019615 -85.1751364  -87.9473916  -84.80062534]; avg score: -85.72583737254081
episode: [1876. 1964. 1937. 1951.]; total steps: 217000; episodes scores: [-83.42928117 -88.76775016 -86.01736679 -83.75130305]; avg score: -85.49142529353935
Rendering episode 1880.0 during training...
seed value:42
episode number sent to renderer:1880.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1880.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1880.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1880.0.mp4
episode rendered
Episode 1/1 - Score: [-86.96940584]
learning timestep: 218000
Policy Loss: -0.09837866574525833
Value Loss: 0.40227800607681274
Entropy: -2.5510735511779785
KL Divergence: 5.171655654907227
episode: [1884. 1972. 1946. 1959.]; total steps: 218000; episodes scores: [-86.29037709 -84.66140692 -83.71210513 -87.05878997]; avg score: -85.43066977756827
episode: [1892. 1980. 1954. 1967.]; total steps: 219000; episodes scores: [-85.42864777 -87.80099613 -87.4822887  -84.73920932]; avg score: -86.36278547994527
Rendering episode 1900.0 during training...
seed value:42
episode number sent to renderer:1900.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1900.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1900.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1900.0.mp4
episode rendered
Episode 1/1 - Score: [-86.69503065]
learning timestep: 220000
Policy Loss: -0.013868795707821846
Value Loss: 0.3284358084201813
Entropy: -2.5471725463867188
KL Divergence: 5.230093002319336
episode: [1900. 1989. 1962. 1975.]; total steps: 220000; episodes scores: [-85.54831654 -86.70188575 -84.18596431 -83.82512684]; avg score: -85.0653233616234
episode: [1908. 1997. 1970. 1983.]; total steps: 221000; episodes scores: [-79.6166967  -86.73248201 -84.77341532 -85.65705223]; avg score: -84.19491156625514
learning timestep: 222000
Policy Loss: -0.1005319282412529
Value Loss: 0.17196916043758392
Entropy: -2.641178607940674
KL Divergence: 6.4560770988464355
episode: [1916. 2005. 1979. 1991.]; total steps: 222000; episodes scores: [-85.64731674 -86.70181085 -86.20862486 -85.05472996]; avg score: -85.90312060306562
Rendering episode 1920.0 during training...
seed value:42
episode number s

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1920.0.mp4
episode rendered
Episode 1/1 - Score: [-85.0595364]
episode: [1924. 2013. 1987. 1998.]; total steps: 223000; episodes scores: [-85.38909995 -82.65112783 -81.90300306 -85.30207232]; avg score: -83.81132578881815
learning timestep: 224000
Policy Loss: 0.2602660357952118
Value Loss: 0.8497000932693481
Entropy: -2.567755699157715
KL Divergence: 6.4508280754089355
episode: [1933. 2021. 1995. 2007.]; total steps: 224000; episodes scores: [-87.96178686 -86.85236753 -83.17232831 -87.4491004 ]; avg score: -86.35889577402702
Rendering episode 1940.0 during training...
seed value:42
episode number sent to renderer:1940.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1940.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1940.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1940.0.mp4
episode rendered
Episode 1/1 - Score: [-87.1609513]
episode: [1941. 2029. 2003. 2014.]; total steps: 225000; episodes scores: [-85.58028624 -79.69913018 -92.34315752 -84.44462703]; avg score: -85.5168002433887
learning timestep: 226000
Policy Loss: 0.1258777529001236
Value Loss: 0.2444586157798767
Entropy: -2.3150410652160645
KL Divergence: 4.559828758239746
episode: [1949. 2037. 2010. 2023.]; total steps: 226000; episodes scores: [-85.47423569 -86.07536067 -81.81593018 -86.0441966 ]; avg score: -84.8524307876327
episode: [1957. 2046. 2018. 2031.]; total steps: 227000; episodes scores: [-84.32999307 -83.14945315 -86.64070116 -81.72674932]; avg score: -83.96172417574826
Rendering episode 1960.0 during training...
seed value:42
episode number sent to renderer:1960.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1960.0.mp4.
Moviepy - Writing video BipedalWalker

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1960.0.mp4
episode rendered
Episode 1/1 - Score: [-87.52888646]
learning timestep: 228000
Policy Loss: -0.10488192737102509
Value Loss: 0.16952168941497803
Entropy: -2.6574790477752686
KL Divergence: 6.873276710510254
episode: [1965. 2053. 2026. 2038.]; total steps: 228000; episodes scores: [-83.92540258 -86.84759176 -85.57344202 -83.90623952]; avg score: -85.063168969094
episode: [1972. 2061. 2034. 2046.]; total steps: 229000; episodes scores: [-82.44209366 -76.19450358 -85.84780984 -82.13086558]; avg score: -81.65381816599674
Rendering episode 1980.0 during training...
seed value:42
episode number sent to renderer:1980.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_1980.0.mp4.
Moviepy - Writing video BipedalWalker/ppo/renders/train/episode_1980.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_1980.0.mp4
episode rendered
Episode 1/1 - Score: [-85.79605022]
learning timestep: 230000
Policy Loss: 0.04907088354229927
Value Loss: 0.26908957958221436
Entropy: -2.446119546890259
KL Divergence: 5.65444278717041
episode: [1980. 2069. 2042. 2054.]; total steps: 230000; episodes scores: [ -82.10667777 -116.57335228  -81.54424554  -84.78288901]; avg score: -91.25179115070665
episode: [1988. 2076. 2049. 2062.]; total steps: 231000; episodes scores: [-87.14961723 -78.30226096 -82.2238584  -83.36641668]; avg score: -82.76053831927214
learning timestep: 232000
Policy Loss: -0.14350809156894684
Value Loss: 0.37581539154052734
Entropy: -2.6667768955230713
KL Divergence: 6.161645412445068
episode: [1996. 2083. 2057. 2070.]; total steps: 232000; episodes scores: [-80.46933409 -79.09891885 -86.19236232 -83.02937993]; avg score: -82.1974987965344
Rendering episode 2000.0 during training...
seed value:42
episode number

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2000.0.mp4
episode rendered
Episode 1/1 - Score: [-86.46375048]
episode: [2003. 2091. 2064. 2077.]; total steps: 233000; episodes scores: [-75.30763061 -78.86208278 -84.52617382 -82.33118699]; avg score: -80.2567685494513
learning timestep: 234000
Policy Loss: -0.03459348902106285
Value Loss: 0.7149350643157959
Entropy: -2.7107787132263184
KL Divergence: 6.768614768981934
episode: [2010. 2099. 2071. 2084.]; total steps: 234000; episodes scores: [ -83.95673469  -84.40569397 -106.0932127   -81.64708517]; avg score: -89.0256816317076
episode: [2017. 2106. 2079. 2090.]; total steps: 235000; episodes scores: [-76.26358224 -79.62405733 -80.4880265  -78.53512942]; avg score: -78.72769887148854
Rendering episode 2020.0 during training...
seed value:42
episode number sent to renderer:2020.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_2020.0.mp4.
Moviepy - Writing video Bipeda

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2020.0.mp4
episode rendered
Episode 1/1 - Score: [-86.35390915]
learning timestep: 236000
Policy Loss: -0.17999665439128876
Value Loss: 0.41885748505592346
Entropy: -2.6778719425201416
KL Divergence: 6.14491081237793
episode: [2025. 2114. 2086. 2097.]; total steps: 236000; episodes scores: [-79.05253983 -80.98395775 -81.77668411 -81.06834106]; avg score: -80.72038068704043
episode: [2031. 2121. 2092. 2104.]; total steps: 237000; episodes scores: [-81.58905036 -82.82760283 -78.34609208 -77.01342761]; avg score: -79.94404322098707
learning timestep: 238000
Policy Loss: 0.09275289624929428
Value Loss: 1.181370735168457
Entropy: -2.6906967163085938
KL Divergence: 7.539802551269531
episode: [2038. 2128. 2099. 2112.]; total steps: 238000; episodes scores: [ -80.33924298 -104.50237654 -108.18830123 -104.09599909]; avg score: -99.28147996067318
Rendering episode 2040.0 during training...
seed value:42
episode number

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2040.0.mp4
episode rendered
Episode 1/1 - Score: [-80.20947408]
episode: [2044. 2135. 2106. 2118.]; total steps: 239000; episodes scores: [-72.07224154 -86.25023791 -72.16204696 -79.29065309]; avg score: -77.44379487268006
learning timestep: 240000
Policy Loss: -0.07175102084875107
Value Loss: 0.4593621492385864
Entropy: -2.58217191696167
KL Divergence: 6.4634175300598145
episode: [2051. 2142. 2112. 2125.]; total steps: 240000; episodes scores: [-82.08803639 -85.32475903 -86.8113769  -70.13248624]; avg score: -81.0891646378475
episode: [2057. 2149. 2119. 2132.]; total steps: 241000; episodes scores: [-73.48354492 -78.28990466 -84.8198695  -80.98243752]; avg score: -79.39393914911756
Rendering episode 2060.0 during training...
seed value:42
episode number sent to renderer:2060.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_2060.0.mp4.
Moviepy - Writing video BipedalWal

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2060.0.mp4
episode rendered
Episode 1/1 - Score: [-81.86164154]
learning timestep: 242000
Policy Loss: 0.06055482476949692
Value Loss: 0.36141443252563477
Entropy: -2.6530022621154785
KL Divergence: 6.097825050354004
episode: [2064. 2154. 2125. 2138.]; total steps: 242000; episodes scores: [-76.78710715 -77.02939112 -70.38555956 -78.31737536]; avg score: -75.62985829801485
episode: [2071. 2160. 2132. 2144.]; total steps: 243000; episodes scores: [-83.52779629 -80.99436087 -79.03037033 -73.43114872]; avg score: -79.24591905214577
learning timestep: 244000
Policy Loss: 0.10906393080949783
Value Loss: 2.1989078521728516
Entropy: -2.60471773147583
KL Divergence: 7.512774467468262
episode: [2078. 2166. 2138. 2152.]; total steps: 244000; episodes scores: [-77.25586217 -82.10373407 -66.50185683 -75.27423211]; avg score: -75.28392129633747
Rendering episode 2080.0 during training...
seed value:42
episode number sent

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2080.0.mp4
episode rendered
Episode 1/1 - Score: [-83.32397516]
episode: [2085. 2172. 2144. 2158.]; total steps: 245000; episodes scores: [-89.57435937 -77.16634288 -76.46293405 -81.06105007]; avg score: -81.06617159196409
learning timestep: 246000
Policy Loss: 0.13545185327529907
Value Loss: 1.5395252704620361
Entropy: -2.7422289848327637
KL Divergence: 8.225635528564453
episode: [2091. 2179. 2150. 2164.]; total steps: 246000; episodes scores: [-84.10337739 -71.72618096 -74.76354459 -72.87727064]; avg score: -75.86759339688174
episode: [2097. 2185. 2156. 2169.]; total steps: 247000; episodes scores: [-65.20425332 -84.94455204 -67.28287559 -61.39406375]; avg score: -69.70643617777573
Rendering episode 2100.0 during training...
seed value:42
episode number sent to renderer:2100.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_2100.0.mp4.
Moviepy - Writing video BipedalWa

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2100.0.mp4
episode rendered
Episode 1/1 - Score: [-77.4373811]
learning timestep: 248000
Policy Loss: -0.025390323251485825
Value Loss: 2.0203404426574707
Entropy: -2.7730283737182617
KL Divergence: 6.723309516906738
episode: [2105. 2191. 2163. 2174.]; total steps: 248000; episodes scores: [-105.49903668  -77.6066071   -82.99537876  -84.80481916]; avg score: -87.72646042593678
episode: [2111. 2197. 2170. 2180.]; total steps: 249000; episodes scores: [-49.11415782 -81.80673256 -81.86593707 -72.71624938]; avg score: -71.37576920982683
learning timestep: 250000
Policy Loss: 0.0030745433177798986
Value Loss: 0.712912380695343
Entropy: -2.8624401092529297
KL Divergence: 7.7985029220581055
episode: [2117. 2201. 2176. 2186.]; total steps: 250000; episodes scores: [ -74.19266712  -49.55145019 -107.04632641  -79.59403725]; avg score: -77.59612024409401
Rendering episode 2120.0 during training...
seed value:42
episode

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2120.0.mp4
episode rendered
Episode 1/1 - Score: [-54.44462339]
episode: [2123. 2207. 2182. 2192.]; total steps: 251000; episodes scores: [-82.00842152 -77.19840243 -89.22487481 -88.96218197]; avg score: -84.34847018291583
learning timestep: 252000
Policy Loss: -0.031068839132785797
Value Loss: 1.3663667440414429
Entropy: -2.8494114875793457
KL Divergence: 8.210768699645996
episode: [2129. 2212. 2186. 2197.]; total steps: 252000; episodes scores: [-81.99862876 -62.10619751 -77.43127965 -86.46947312]; avg score: -77.00139476277786
episode: [2134. 2217. 2191. 2201.]; total steps: 253000; episodes scores: [-65.27811111 -75.3630496  -87.16493629 -76.85464141]; avg score: -76.1651846028793
learning timestep: 254000
Policy Loss: -0.023702913895249367
Value Loss: 0.6205655336380005
Entropy: -2.8929836750030518
KL Divergence: 8.738736152648926
episode: [2139. 2219. 2198. 2205.]; total steps: 254000; episodes scores:

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2140.0.mp4
episode rendered
Episode 1/1 - Score: [-66.24631928]
episode: [2146. 2224. 2203. 2210.]; total steps: 255000; episodes scores: [-80.33294884 -81.20066215 -57.90393383 -64.68052674]; avg score: -71.02951789101974
learning timestep: 256000
Policy Loss: -0.11751396954059601
Value Loss: 0.44945278763771057
Entropy: -2.58590030670166
KL Divergence: 9.119956970214844
episode: [2150. 2229. 2209. 2216.]; total steps: 256000; episodes scores: [-84.16546147 -50.51944671 -70.20867106 -64.27377068]; avg score: -67.29183747824561
episode: [2154. 2235. 2215. 2220.]; total steps: 257000; episodes scores: [-35.01211846 -83.26157204 -71.75924062 -74.7566256 ]; avg score: -66.1973891806292
Rendering episode 2160.0 during training...
seed value:42
episode number sent to renderer:2160.0
rendering episode...
Moviepy - Building video BipedalWalker/ppo/renders/train/episode_2160.0.mp4.
Moviepy - Writing video BipedalWal

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2160.0.mp4
episode rendered
Episode 1/1 - Score: [-71.11015636]
learning timestep: 258000
Policy Loss: -0.06671522557735443
Value Loss: 1.0436848402023315
Entropy: -2.9495341777801514
KL Divergence: 10.629786491394043
episode: [2160. 2240. 2221. 2226.]; total steps: 258000; episodes scores: [-76.73947999 -80.23866471 -53.71648879 -78.07497226]; avg score: -72.19240143618315
episode: [2165. 2244. 2226. 2231.]; total steps: 259000; episodes scores: [-54.20618672 -58.62975948 -78.45761613 -61.88982909]; avg score: -63.29584785549977
learning timestep: 260000
Policy Loss: 0.27409154176712036
Value Loss: 1.3139903545379639
Entropy: -2.8295464515686035
KL Divergence: 7.754397392272949
episode: [2171. 2246. 2230. 2238.]; total steps: 260000; episodes scores: [-38.01994341 -30.94112653 -42.66819013 -73.54859516]; avg score: -46.29446380862251
episode: [2176. 2250. 2234. 2244.]; total steps: 261000; episodes scores: 

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2180.0.mp4
episode rendered
Episode 1/1 - Score: [-83.21067969]
learning timestep: 262000
Policy Loss: -0.26547059416770935
Value Loss: 0.9079775810241699
Entropy: -2.9657816886901855
KL Divergence: 9.039329528808594
episode: [2180. 2257. 2237. 2248.]; total steps: 262000; episodes scores: [-74.15604601 -80.43165558 -56.99862306 -74.22337474]; avg score: -71.45242484696783
episode: [2185. 2263. 2242. 2254.]; total steps: 263000; episodes scores: [ -51.51922503  -37.22744694 -102.98634782  -86.50948992]; avg score: -69.5606274278611
learning timestep: 264000
Policy Loss: 0.1977916657924652
Value Loss: 1.0232291221618652
Entropy: -2.851471424102783
KL Divergence: 8.8829927444458
episode: [2191. 2268. 2249. 2260.]; total steps: 264000; episodes scores: [-64.55639943 -64.26448012 -86.48275285 -73.7960952 ]; avg score: -72.27493189941316
episode: [2195. 2272. 2255. 2265.]; total steps: 265000; episodes scores: [-

                                                              

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2200.0.mp4
episode rendered
Episode 1/1 - Score: [-71.80451277]
learning timestep: 266000
Policy Loss: -0.11672326177358627
Value Loss: 0.7670583724975586
Entropy: -2.9569177627563477
KL Divergence: 8.715373992919922
episode: [2200. 2278. 2258. 2270.]; total steps: 266000; episodes scores: [-73.45575878 -74.88495008 -34.12234324 -62.79482917]; avg score: -61.31447031674472
episode: [2206. 2282. 2262. 2276.]; total steps: 267000; episodes scores: [-56.55112917 -75.14667498 -69.63457593 -64.27158126]; avg score: -66.40099033426043
learning timestep: 268000
Policy Loss: -0.03019961155951023
Value Loss: 1.0317671298980713
Entropy: -2.978480100631714
KL Divergence: 8.456792831420898
episode: [2213. 2287. 2265. 2280.]; total steps: 268000; episodes scores: [-87.02449519 -52.96141996 -52.9265794  -41.00313673]; avg score: -58.478907821446974
episode: [2218. 2290. 2269. 2287.]; total steps: 269000; episodes scores: 

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2220.0.mp4
episode rendered
Episode 1/1 - Score: [-57.69141554]
learning timestep: 270000
Policy Loss: -0.2962586283683777
Value Loss: 1.1301590204238892
Entropy: -2.9765303134918213
KL Divergence: 10.78464412689209
episode: [2223. 2293. 2274. 2291.]; total steps: 270000; episodes scores: [-65.07518964 -66.16771533 -78.96921472 -31.82873984]; avg score: -60.510214883987025
episode: [2228. 2298. 2281. 2297.]; total steps: 271000; episodes scores: [-72.13442626 -35.76964823 -75.53750296 -70.06183274]; avg score: -63.37585254786225
learning timestep: 272000
Policy Loss: -0.07935737818479538
Value Loss: 0.9903631210327148
Entropy: -3.067530632019043
KL Divergence: 10.675592422485352
episode: [2233. 2304. 2286. 2303.]; total steps: 272000; episodes scores: [-40.37361694 -79.04920615 -74.17245622 -87.74626814]; avg score: -70.33538686385688
episode: [2239. 2307. 2290. 2308.]; total steps: 273000; episodes scores: 

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2240.0.mp4
episode rendered
Episode 1/1 - Score: [-84.13937301]
learning timestep: 274000
Policy Loss: -0.05996377021074295
Value Loss: 1.1033401489257812
Entropy: -3.019636631011963
KL Divergence: 9.725693702697754
episode: [2242. 2310. 2294. 2313.]; total steps: 274000; episodes scores: [-68.79586773 -63.47271558 -35.77785639 -72.96990821]; avg score: -60.25408697736622
episode: [2246. 2315. 2298. 2317.]; total steps: 275000; episodes scores: [-68.69813407 -71.12011877 -86.17936211 -65.8548715 ]; avg score: -72.96312161191273
learning timestep: 276000
Policy Loss: 0.09433932602405548
Value Loss: 0.9960113167762756
Entropy: -2.9690375328063965
KL Divergence: 8.405643463134766
episode: [2253. 2319. 2302. 2320.]; total steps: 276000; episodes scores: [-70.89533613 -60.31547573 -52.00761082 -26.18025596]; avg score: -52.34966966398463
episode: [2257. 2323. 2307. 2323.]; total steps: 277000; episodes scores: [-

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2260.0.mp4
episode rendered
Episode 1/1 - Score: [-67.68849972]
learning timestep: 278000
Policy Loss: -0.017837896943092346
Value Loss: 0.8031799793243408
Entropy: -3.0063531398773193
KL Divergence: 8.443724632263184
episode: [2260. 2327. 2309. 2329.]; total steps: 278000; episodes scores: [-53.35832447 -61.63731659  19.85596069 -68.58641379]; avg score: -40.93152354128725
episode: [2264. 2330. 2312. 2333.]; total steps: 279000; episodes scores: [-28.40332569 -26.38619798 -54.14637602 -57.79700383]; avg score: -41.68322588086983
learning timestep: 280000
Policy Loss: 0.12857100367546082
Value Loss: 1.201434850692749
Entropy: -3.216897487640381
KL Divergence: 10.305622100830078
episode: [2268. 2333. 2317. 2338.]; total steps: 280000; episodes scores: [-51.43737161 -13.68970817 -64.6357489  -86.67913671]; avg score: -54.11049134809119
episode: [2271. 2336. 2320. 2341.]; total steps: 281000; episodes scores: [

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2280.0.mp4
episode rendered
Episode 1/1 - Score: [-67.78156971]
episode: [2280. 2343. 2325. 2349.]; total steps: 283000; episodes scores: [-63.558221   -49.39928086 -72.44723449 -34.08363394]; avg score: -54.87209257423366
learning timestep: 284000
Policy Loss: -0.22138208150863647
Value Loss: 4.5518646240234375
Entropy: -3.0291082859039307
KL Divergence: 10.090234756469727
episode: [2284. 2346. 2328. 2354.]; total steps: 284000; episodes scores: [-61.05413992 -68.90158054 -30.22986127 -50.34986386]; avg score: -52.63386139866927
episode: [2288. 2349. 2333. 2360.]; total steps: 285000; episodes scores: [-56.53779605  -0.33647594 -20.6301005  -74.67078091]; avg score: -38.04378835216133
learning timestep: 286000
Policy Loss: 0.0009856661781668663
Value Loss: 1.4416344165802002
Entropy: -3.1171467304229736
KL Divergence: 10.8441743850708
episode: [2292. 2352. 2338. 2364.]; total steps: 286000; episodes scores:

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2300.0.mp4
episode rendered
Episode 1/1 - Score: [-55.77588606]
learning timestep: 288000
Policy Loss: 0.04705946147441864
Value Loss: 0.7104421854019165
Entropy: -2.9431638717651367
KL Divergence: 10.186071395874023
episode: [2302. 2361. 2345. 2374.]; total steps: 288000; episodes scores: [-70.78480941 -50.86448476 -31.06956742 -61.48313268]; avg score: -53.55049856776621
episode: [2306. 2366. 2351. 2378.]; total steps: 289000; episodes scores: [-58.56867388 -59.74998355 -79.48039064 -70.96573674]; avg score: -67.19119620189768
learning timestep: 290000
Policy Loss: -0.022963210940361023
Value Loss: 1.4024338722229004
Entropy: -3.047243595123291
KL Divergence: 9.14102554321289
episode: [2309. 2370. 2355. 2380.]; total steps: 290000; episodes scores: [-20.40857589 -36.43268533 -33.27523122 -74.75869033]; avg score: -41.218795693197464
episode: [2312. 2373. 2359. 2384.]; total steps: 291000; episodes scores: 

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2320.0.mp4
episode rendered
Episode 1/1 - Score: [-33.7148135]
episode: [2320. 2380. 2366. 2392.]; total steps: 293000; episodes scores: [-75.77647041   7.0687765  -36.42009233 -57.16005732]; avg score: -40.571960889682686
learning timestep: 294000
Policy Loss: -0.21538479626178741
Value Loss: 1.0147801637649536
Entropy: -3.060147762298584
KL Divergence: 11.383108139038086
episode: [2323. 2384. 2370. 2395.]; total steps: 294000; episodes scores: [-43.43039565 -58.16092634 -69.51782476 -24.19518744]; avg score: -48.8260835494753
episode: [2327. 2388. 2373. 2397.]; total steps: 295000; episodes scores: [-75.43346479 -70.22060093 -20.06781846 -58.74749506]; avg score: -56.117344812644646
learning timestep: 296000
Policy Loss: 0.02000381238758564
Value Loss: 1.3270727396011353
Entropy: -3.2150235176086426
KL Divergence: 12.298633575439453
episode: [2332. 2391. 2376. 2400.]; total steps: 296000; episodes scores: 

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2340.0.mp4
episode rendered
Episode 1/1 - Score: [-52.03243863]
learning timestep: 298000
Policy Loss: 0.21696186065673828
Value Loss: 0.9916747808456421
Entropy: -3.1216256618499756
KL Divergence: 12.812141418457031
episode: [2342. 2401. 2383. 2410.]; total steps: 298000; episodes scores: [-55.20651856 -82.54633718 -62.92000929 -46.43215409]; avg score: -61.77625477604832
episode: [2347. 2406. 2387. 2415.]; total steps: 299000; episodes scores: [-77.6378335  -44.62105636 -71.73998102 -60.4691098 ]; avg score: -63.61699516969918
learning timestep: 300000
Policy Loss: -0.17157305777072906
Value Loss: 1.5688683986663818
Entropy: -3.108181953430176
KL Divergence: 11.163786888122559
episode: [2349. 2411. 2390. 2420.]; total steps: 300000; episodes scores: [-38.72866137 -61.30986278 -62.973813   -64.87949238]; avg score: -56.97295738145677
episode: [2352. 2413. 2394. 2426.]; total steps: 301000; episodes scores: 

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2360.0.mp4
episode rendered
Episode 1/1 - Score: [-26.92411976]
episode: [2360. 2418. 2404. 2437.]; total steps: 303000; episodes scores: [-57.01030158 -30.82657203 -23.40822372 -71.52216899]; avg score: -45.69181658078248
learning timestep: 304000
Policy Loss: -0.4040859043598175
Value Loss: 1.4213604927062988
Entropy: -3.0500924587249756
KL Divergence: 11.714723587036133
episode: [2364. 2422. 2407. 2441.]; total steps: 304000; episodes scores: [-51.30167205 -81.89732784 -73.7175324  -14.16873159]; avg score: -55.27131597012547
episode: [2368. 2426. 2411. 2443.]; total steps: 305000; episodes scores: [-73.64360446 -80.92185422 -53.09105629 -18.38634449]; avg score: -56.510714866119756
learning timestep: 306000
Policy Loss: 0.014105039648711681
Value Loss: 1.2937642335891724
Entropy: -3.0726518630981445
KL Divergence: 9.905179977416992
episode: [2373. 2430. 2413. 2446.]; total steps: 306000; episodes scores:

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2380.0.mp4
episode rendered
Episode 1/1 - Score: [-18.94329467]
episode: [2382. 2441. 2424. 2461.]; total steps: 309000; episodes scores: [-56.38885501 -23.62064895 -50.92304641 -55.08079569]; avg score: -46.5033365137886
learning timestep: 310000
Policy Loss: 0.05694533884525299
Value Loss: 2.7336390018463135
Entropy: -3.1994450092315674
KL Divergence: 10.807575225830078
episode: [2387. 2445. 2430. 2466.]; total steps: 310000; episodes scores: [-58.72510901 -34.50526454 -76.896078   -29.78016876]; avg score: -49.976655076565834
episode: [2391. 2447. 2435. 2468.]; total steps: 311000; episodes scores: [-27.59758567 -44.18953285 -89.68939883   7.34684699]; avg score: -38.53241759066951
learning timestep: 312000
Policy Loss: -0.017890099436044693
Value Loss: 1.1621818542480469
Entropy: -3.1685240268707275
KL Divergence: 12.861722946166992
episode: [2395. 2449. 2440. 2472.]; total steps: 312000; episodes scores

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2400.0.mp4
episode rendered
Episode 1/1 - Score: [-82.17343587]
learning timestep: 314000
Policy Loss: -0.17469444870948792
Value Loss: 0.9984925985336304
Entropy: -3.215426206588745
KL Divergence: 12.028707504272461
episode: [2402. 2454. 2448. 2478.]; total steps: 314000; episodes scores: [-20.18813293 -13.07419605 -64.10215736  14.99662583]; avg score: -20.59196512848094
episode: [2405. 2457. 2452. 2480.]; total steps: 315000; episodes scores: [-35.1066685  -58.39262081 -13.64527135 -27.45395181]; avg score: -33.64962811702911
learning timestep: 316000
Policy Loss: 0.24792762100696564
Value Loss: 1.6842412948608398
Entropy: -3.2254695892333984
KL Divergence: 12.032210350036621
episode: [2408. 2459. 2454. 2484.]; total steps: 316000; episodes scores: [ -8.82905761 -45.60683574 -37.4084414  -49.36168665]; avg score: -35.30150534990537
episode: [2410. 2460. 2457. 2487.]; total steps: 317000; episodes scores: 

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2420.0.mp4
episode rendered
Episode 1/1 - Score: [-17.05239566]
learning timestep: 322000
Policy Loss: 0.056038748472929
Value Loss: 1.5818195343017578
Entropy: -3.238888740539551
KL Divergence: 12.5671968460083
episode: [2422. 2471. 2472. 2502.]; total steps: 322000; episodes scores: [-73.74464116 -35.13714235 -72.45953042 -70.47593869]; avg score: -62.95431315366473
episode: [2424. 2474. 2474. 2503.]; total steps: 323000; episodes scores: [ 43.89105605 -85.83175097 -52.28393078  55.44024686]; avg score: -9.696094709879397
learning timestep: 324000
Policy Loss: 0.008206816390156746
Value Loss: 1.4886035919189453
Entropy: -3.300351858139038
KL Divergence: 10.15047550201416
episode: [2428. 2475. 2478. 2505.]; total steps: 324000; episodes scores: [-33.24581001  84.95201466 -83.1954371  -84.71137403]; avg score: -29.05015161673867
episode: [2429. 2477. 2479. 2507.]; total steps: 325000; episodes scores: [ 84.7

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2440.0.mp4
episode rendered
Episode 1/1 - Score: [23.31512344]
episode: [2441. 2491. 2495. 2523.]; total steps: 331000; episodes scores: [-36.78548891 -49.212993    -8.88431966 -81.96820731]; avg score: -44.212752219045385
learning timestep: 332000
Policy Loss: -0.10689720511436462
Value Loss: 1.4944005012512207
Entropy: -3.158954381942749
KL Divergence: 11.126771926879883
episode: [2444. 2493. 2499. 2525.]; total steps: 332000; episodes scores: [-48.55246748 -27.60185325  -6.629668   -33.77505485]; avg score: -29.139760894488074
episode: [2447. 2495. 2501. 2527.]; total steps: 333000; episodes scores: [-64.61909227 -44.11012021 -62.5941452  -50.89160788]; avg score: -55.553741390098345
learning timestep: 334000
Policy Loss: 0.03611084073781967
Value Loss: 0.6485116481781006
Entropy: -3.324009418487549
KL Divergence: 11.066866874694824
episode: [2448. 2495. 2502. 2528.]; total steps: 334000; episodes scores:

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2460.0.mp4
episode rendered
Episode 1/1 - Score: [-43.6226432]
episode: [2461. 2506. 2514. 2538.]; total steps: 339000; episodes scores: [-57.10229851 -64.76387378 -55.37715541 -55.40977091]; avg score: -58.163274653964216
learning timestep: 340000
Policy Loss: 0.29544559121131897
Value Loss: 1.488443374633789
Entropy: -3.0915944576263428
KL Divergence: 11.486565589904785
episode: [2463. 2509. 2518. 2541.]; total steps: 340000; episodes scores: [ -9.59583963 -64.6984153  -63.07515178 -72.84296165]; avg score: -52.55309208711314
episode: [2465. 2511. 2521. 2542.]; total steps: 341000; episodes scores: [-19.2322224    1.71425007 -51.90853786 -21.05560147]; avg score: -22.620527914272134
learning timestep: 342000
Policy Loss: -0.2786182463169098
Value Loss: 1.2228091955184937
Entropy: -3.344200611114502
KL Divergence: 12.073022842407227
episode: [2468. 2513. 2523. 2544.]; total steps: 342000; episodes scores: [

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2480.0.mp4
episode rendered
Episode 1/1 - Score: [-62.56288185]
episode: [2482. 2521. 2537. 2557.]; total steps: 347000; episodes scores: [-13.25914578 -59.62953338 -34.27898406  61.7124638 ]; avg score: -11.36379985415708
learning timestep: 348000
Policy Loss: 0.24781031906604767
Value Loss: 1.0013399124145508
Entropy: -3.1898794174194336
KL Divergence: 12.388435363769531
episode: [2484. 2525. 2540. 2560.]; total steps: 348000; episodes scores: [ -3.85324664 -51.87620591 -82.85671815 -35.95071217]; avg score: -43.63422071755787
episode: [2487. 2529. 2543. 2563.]; total steps: 349000; episodes scores: [-30.63858818 -84.63824743 -64.56510516 -15.42273523]; avg score: -48.81616900067156
learning timestep: 350000
Policy Loss: -0.17010559141635895
Value Loss: 0.7484577894210815
Entropy: -3.276034355163574
KL Divergence: 12.590240478515625
episode: [2491. 2531. 2545. 2567.]; total steps: 350000; episodes scores: 

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2500.0.mp4
episode rendered
Episode 1/1 - Score: [-69.29492941]
episode: [2500. 2541. 2555. 2577.]; total steps: 353000; episodes scores: [-60.78945708 -57.08479888 -41.92866866 -23.13480307]; avg score: -45.73443192509856
learning timestep: 354000
Policy Loss: -0.024492735043168068
Value Loss: 1.7698166370391846
Entropy: -3.269890069961548
KL Divergence: 12.207923889160156
episode: [2505. 2543. 2560. 2580.]; total steps: 354000; episodes scores: [ -58.51777571  -19.37115101 -103.75361439  -31.06165824]; avg score: -53.17604983633288
episode: [2508. 2546. 2562. 2582.]; total steps: 355000; episodes scores: [-49.25044433 -34.0912821   10.93252464  14.57885759]; avg score: -14.457586051921165
learning timestep: 356000
Policy Loss: -0.09200388938188553
Value Loss: 1.168527603149414
Entropy: -3.193516254425049
KL Divergence: 11.165458679199219
episode: [2510. 2548. 2564. 2586.]; total steps: 356000; episodes sco

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2520.0.mp4
episode rendered
Episode 1/1 - Score: [32.11403916]
episode: [2521. 2558. 2576. 2598.]; total steps: 361000; episodes scores: [-67.49279915 -78.84193198 -45.18742585  80.04214312]; avg score: -27.870003465597676
learning timestep: 362000
Policy Loss: -0.24173352122306824
Value Loss: 1.1760848760604858
Entropy: -3.2043235301971436
KL Divergence: 13.411857604980469
episode: [2524. 2560. 2579. 2599.]; total steps: 362000; episodes scores: [-55.96409053 -24.77266356 -67.55017342 125.32727303]; avg score: -5.739913621715466
episode: [2528. 2564. 2582. 2600.]; total steps: 363000; episodes scores: [  3.69021837 -20.54608541 -50.71899585  65.69193041]; avg score: -0.4707331202237697
learning timestep: 364000
Policy Loss: -0.0803474560379982
Value Loss: 0.5217248201370239
Entropy: -3.316645622253418
KL Divergence: 11.18496322631836
episode: [2530. 2564. 2584. 2604.]; total steps: 364000; episodes scores: 

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2540.0.mp4
episode rendered
Episode 1/1 - Score: [-11.81835005]
episode: [2540. 2569. 2591. 2612.]; total steps: 367000; episodes scores: [ -52.69992979   51.66478998 -103.52507932  -50.60667507]; avg score: -38.79172355042199
learning timestep: 368000
Policy Loss: -0.013600155711174011
Value Loss: 1.0306339263916016
Entropy: -3.2402608394622803
KL Divergence: 14.28085708618164
episode: [2542. 2571. 2592. 2616.]; total steps: 368000; episodes scores: [ 39.57598576 -78.42834943  81.22495973 -49.33138328]; avg score: -1.7396968042457424
episode: [2544. 2574. 2594. 2618.]; total steps: 369000; episodes scores: [-50.73851136 -35.62918555 -51.61649949  57.87569191]; avg score: -20.027126120799807
learning timestep: 370000
Policy Loss: -0.11712923645973206
Value Loss: 1.305424690246582
Entropy: -3.451138734817505
KL Divergence: 13.339993476867676
episode: [2547. 2577. 2596. 2619.]; total steps: 370000; episodes sc

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2560.0.mp4
episode rendered
Episode 1/1 - Score: [44.58406401]
episode: [2561. 2595. 2612. 2636.]; total steps: 377000; episodes scores: [ 31.85384218 174.71054969 -12.81330865  71.41184179]; avg score: 66.29073125538028
learning timestep: 378000
Policy Loss: 0.052685659378767014
Value Loss: 0.5351332426071167
Entropy: -3.453073024749756
KL Divergence: 15.132438659667969
episode: [2562. 2595. 2613. 2637.]; total steps: 378000; episodes scores: [133.14368578 174.71054969  90.9841532   50.01337173]; avg score: 112.21294009788451
episode: [2564. 2598. 2614. 2640.]; total steps: 379000; episodes scores: [-29.59037016 -14.4017632  101.61891958 -34.2482057 ]; avg score: 5.844645129902769
learning timestep: 380000
Policy Loss: -0.06932010501623154
Value Loss: 1.05727219581604
Entropy: -3.4844627380371094
KL Divergence: 12.982808113098145
episode: [2566. 2599. 2616. 2641.]; total steps: 380000; episodes scores: [-47

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2580.0.mp4
episode rendered
Episode 1/1 - Score: [-66.75222725]
episode: [2581. 2612. 2630. 2654.]; total steps: 387000; episodes scores: [ -8.0658952   92.1168838  -30.12886202  56.79217705]; avg score: 27.678575905374153
learning timestep: 388000
Policy Loss: -0.14878247678279877
Value Loss: 1.2924576997756958
Entropy: -3.404890537261963
KL Divergence: 15.000345230102539
episode: [2584. 2613. 2633. 2657.]; total steps: 388000; episodes scores: [-25.1464454  303.98634743 -52.89634886 -15.83777558]; avg score: 52.526444395878286
episode: [2585. 2615. 2633. 2660.]; total steps: 389000; episodes scores: [ -9.6534879  -22.58083264 -52.89634886   3.17079094]; avg score: -20.489969615573777
learning timestep: 390000
Policy Loss: 0.02623763680458069
Value Loss: 1.9006186723709106
Entropy: -3.4078657627105713
KL Divergence: 13.003005981445312
episode: [2587. 2617. 2634. 2661.]; total steps: 390000; episodes scores:

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2600.0.mp4
episode rendered
Episode 1/1 - Score: [-8.63212497]
episode: [2603. 2630. 2652. 2676.]; total steps: 395000; episodes scores: [-45.63181816 -27.82477829 -67.00425178 -64.04590425]; avg score: -51.12668812012176
learning timestep: 396000
Policy Loss: 0.028653334826231003
Value Loss: 2.0674045085906982
Entropy: -3.4682085514068604
KL Divergence: 14.796457290649414
episode: [2607. 2632. 2656. 2679.]; total steps: 396000; episodes scores: [-47.64555211  63.84595147 -62.30902752 -48.48789927]; avg score: -23.649131855470863
episode: [2611. 2635. 2657. 2683.]; total steps: 397000; episodes scores: [-61.61535864 -59.55706254 -71.44370997 -51.8009994 ]; avg score: -61.10428263525989
learning timestep: 398000
Policy Loss: 0.14180020987987518
Value Loss: 1.1589481830596924
Entropy: -3.343235731124878
KL Divergence: 11.502189636230469
episode: [2614. 2639. 2660. 2685.]; total steps: 398000; episodes scores: 

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2620.0.mp4
episode rendered
Episode 1/1 - Score: [-17.79854939]
learning timestep: 400000
Policy Loss: 0.15919536352157593
Value Loss: 1.5213158130645752
Entropy: -3.3795340061187744
KL Divergence: 12.479681015014648
episode: [2621. 2643. 2668. 2692.]; total steps: 400000; episodes scores: [-33.64673378 -34.81581121 -85.72100692 -22.49628563]; avg score: -44.1699593849221
episode: [2624. 2645. 2672. 2693.]; total steps: 401000; episodes scores: [-63.87059696 -12.11386156 -54.70286189 -23.00214795]; avg score: -38.422367089490415
learning timestep: 402000
Policy Loss: 0.0483916774392128
Value Loss: 1.0750854015350342
Entropy: -3.495561122894287
KL Divergence: 16.060977935791016
episode: [2629. 2649. 2677. 2696.]; total steps: 402000; episodes scores: [-64.32606401 -38.71278728 -86.92433881 -74.6651144 ]; avg score: -66.15707612680117
episode: [2633. 2649. 2679. 2699.]; total steps: 403000; episodes scores: [-

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2640.0.mp4
episode rendered
Episode 1/1 - Score: [-63.16806631]
learning timestep: 406000
Policy Loss: 0.011574894189834595
Value Loss: 1.709397554397583
Entropy: -3.235288143157959
KL Divergence: 14.87621021270752
episode: [2641. 2658. 2686. 2708.]; total steps: 406000; episodes scores: [ 77.14950794   2.00916375 -71.73669988 -65.26981649]; avg score: -14.46196116842669
episode: [2644. 2660. 2689. 2710.]; total steps: 407000; episodes scores: [-53.17320991 -43.41962643 -19.15258839 -43.71035237]; avg score: -39.86394427218692
learning timestep: 408000
Policy Loss: -0.11747930943965912
Value Loss: 2.322864532470703
Entropy: -3.5075740814208984
KL Divergence: 15.330377578735352
episode: [2649. 2664. 2690. 2713.]; total steps: 408000; episodes scores: [ -52.16261316  -58.79354867   -5.22300941 -103.34418959]; avg score: -54.88084020681913
episode: [2650. 2667. 2692. 2715.]; total steps: 409000; episodes scores

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2660.0.mp4
episode rendered
Episode 1/1 - Score: [-11.75007229]
episode: [2660. 2676. 2699. 2724.]; total steps: 413000; episodes scores: [-28.72207604 -59.34579788 -35.60492216 -26.41243996]; avg score: -37.52130901084948
learning timestep: 414000
Policy Loss: 0.18985405564308167
Value Loss: 1.4947031736373901
Entropy: -3.454803943634033
KL Divergence: 14.561664581298828
episode: [2663. 2678. 2701. 2725.]; total steps: 414000; episodes scores: [ -5.32057772  26.76020998 -26.08830171  64.7612847 ]; avg score: 15.0281538132766
episode: [2666. 2680. 2705. 2728.]; total steps: 415000; episodes scores: [-42.44604152 -52.9315312  -17.87026533 -36.68540192]; avg score: -37.483309993006586
learning timestep: 416000
Policy Loss: 0.1210094541311264
Value Loss: 1.6184483766555786
Entropy: -3.6024742126464844
KL Divergence: 16.136877059936523
episode: [2669. 2682. 2706. 2731.]; total steps: 416000; episodes scores: [-1

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2680.0.mp4
episode rendered
Episode 1/1 - Score: [-9.28350882]
episode: [2682. 2694. 2718. 2745.]; total steps: 421000; episodes scores: [-76.7607352  -34.51081114 -85.16374916  17.40526593]; avg score: -44.75750739101803
learning timestep: 422000
Policy Loss: -0.2795529067516327
Value Loss: 2.236124038696289
Entropy: -3.4742581844329834
KL Divergence: 14.40339183807373
episode: [2684. 2697. 2721. 2747.]; total steps: 422000; episodes scores: [-41.63721167  10.53396867 -79.32386171 -15.99155655]; avg score: -31.60466531341917
episode: [2686. 2698. 2725. 2749.]; total steps: 423000; episodes scores: [-68.8311908   95.26373901 -37.4513579   21.65981495]; avg score: 2.660251316190797
learning timestep: 424000
Policy Loss: -0.012925430200994015
Value Loss: 1.6728270053863525
Entropy: -3.5110344886779785
KL Divergence: 15.269951820373535
episode: [2689. 2700. 2727. 2752.]; total steps: 424000; episodes scores: [-

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2700.0.mp4
episode rendered
Episode 1/1 - Score: [39.25155677]
learning timestep: 430000
Policy Loss: -0.04992292448878288
Value Loss: 1.3637851476669312
Entropy: -3.4976935386657715
KL Divergence: 12.395109176635742
episode: [2703. 2711. 2739. 2761.]; total steps: 430000; episodes scores: [-28.17419146 128.69929175 -69.15323088   6.64923058]; avg score: 9.505275000281472
episode: [2704. 2712. 2740. 2762.]; total steps: 431000; episodes scores: [  6.97674724  80.71857338 131.36719101  62.47962946]; avg score: 70.38553527172131
learning timestep: 432000
Policy Loss: -0.05902840569615364
Value Loss: 0.5730433464050293
Entropy: -3.6217241287231445
KL Divergence: 15.14273738861084
episode: [2706. 2713. 2741. 2765.]; total steps: 432000; episodes scores: [-56.22501005  31.82648344 109.62417712 -48.68345496]; avg score: 9.135548888828144
episode: [2707. 2716. 2744. 2766.]; total steps: 433000; episodes scores: [30

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2720.0.mp4
episode rendered
Episode 1/1 - Score: [81.14354342]
learning timestep: 442000
Policy Loss: 0.10499094426631927
Value Loss: 5.3012261390686035
Entropy: -3.569121837615967
KL Divergence: 13.782392501831055
episode: [2720. 2730. 2753. 2776.]; total steps: 442000; episodes scores: [-35.37519776 144.38336103 301.36847014 306.21671592]; avg score: 179.14833733404504
episode: [2721. 2732. 2754. 2778.]; total steps: 443000; episodes scores: [19.45996765  2.55196947 23.16112763 68.47211009]; avg score: 28.4112937098186
learning timestep: 444000
Policy Loss: 0.08743393421173096
Value Loss: 1.3562724590301514
Entropy: -3.561124324798584
KL Divergence: 16.396726608276367
episode: [2722. 2734. 2755. 2779.]; total steps: 444000; episodes scores: [303.76390067  28.31813666 304.63925215 -60.51333132]; avg score: 144.05198953829742
episode: [2723. 2735. 2756. 2780.]; total steps: 445000; episodes scores: [ 62.4183

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2740.0.mp4
episode rendered
Episode 1/1 - Score: [154.89222749]
episode: [2741. 2753. 2775. 2798.]; total steps: 457000; episodes scores: [-53.762463   -17.52785831 -59.65990804 -53.85173606]; avg score: -46.20049135254745
learning timestep: 458000
Policy Loss: -0.15336903929710388
Value Loss: 0.5959327816963196
Entropy: -3.3630599975585938
KL Divergence: 15.046255111694336
episode: [2742. 2755. 2776. 2799.]; total steps: 458000; episodes scores: [ 77.896112   -27.20072835  97.53368133 309.12002724]; avg score: 114.33727305466621
episode: [2743. 2757. 2779. 2801.]; total steps: 459000; episodes scores: [175.52026902 -60.67995614 -76.44613307  87.56973166]; avg score: 31.49097786807637
learning timestep: 460000
Policy Loss: -0.03973975032567978
Value Loss: 0.8161295652389526
Entropy: -3.5452356338500977
KL Divergence: 13.942085266113281
episode: [2745. 2760. 2780. 2806.]; total steps: 460000; episodes scores:

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2760.0.mp4
episode rendered
Episode 1/1 - Score: [72.44636628]
episode: [2761. 2778. 2793. 2820.]; total steps: 473000; episodes scores: [-76.83954178 306.97178261 309.8576404  136.36007134]; avg score: 169.08748814265385
learning timestep: 474000
Policy Loss: -0.049503426998853683
Value Loss: 1.6877238750457764
Entropy: -3.421288013458252
KL Divergence: 15.884344100952148
episode: [2763. 2781. 2794. 2822.]; total steps: 474000; episodes scores: [ 46.4065712  -12.64033529   3.02689307 -44.66979747]; avg score: -1.9691671201935819
episode: [2764. 2781. 2796. 2824.]; total steps: 475000; episodes scores: [307.44526029 -12.64033529  28.98855851  91.84064575]; avg score: 103.9085323150985
learning timestep: 476000
Policy Loss: -0.11571267247200012
Value Loss: 1.176340103149414
Entropy: -3.4385247230529785
KL Divergence: 13.437826156616211
episode: [2764. 2783. 2797. 2825.]; total steps: 476000; episodes scores: 

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2780.0.mp4
episode rendered
Episode 1/1 - Score: [162.93875636]
episode: [2780. 2801. 2811. 2840.]; total steps: 487000; episodes scores: [ 13.92218401 -38.83651822 -51.07864114 -43.40828956]; avg score: -29.850316227212414
learning timestep: 488000
Policy Loss: -0.04969726502895355
Value Loss: 1.7587928771972656
Entropy: -3.4338088035583496
KL Divergence: 13.830368995666504
episode: [2782. 2804. 2813. 2843.]; total steps: 488000; episodes scores: [-75.72582199 -42.55155604  15.93261685 -12.07937896]; avg score: -28.606035033619825
episode: [2783. 2806. 2815. 2844.]; total steps: 489000; episodes scores: [162.40386429  40.79382577 -75.86049226 132.22730129]; avg score: 64.89112477373442
learning timestep: 490000
Policy Loss: -0.25245401263237
Value Loss: 1.4591941833496094
Entropy: -3.438976287841797
KL Divergence: 13.33250617980957
episode: [2786. 2807. 2817. 2846.]; total steps: 490000; episodes scores: [-

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2800.0.mp4
episode rendered
Episode 1/1 - Score: [52.74078974]
episode: [2800. 2817. 2828. 2860.]; total steps: 497000; episodes scores: [-49.15839891  42.57056268  93.96516687  32.51909414]; avg score: 29.974106193550647
learning timestep: 498000
Policy Loss: 0.04046614468097687
Value Loss: 2.3902087211608887
Entropy: -3.6207051277160645
KL Divergence: 15.245955467224121
episode: [2802. 2819. 2829. 2862.]; total steps: 498000; episodes scores: [ -1.63241284 -15.60199282  49.4913245   12.29588886]; avg score: 11.138201923296357
episode: [2804. 2822. 2830. 2864.]; total steps: 499000; episodes scores: [-47.43280058  15.34940653  28.14416348  60.36442063]; avg score: 14.106297515029803
learning timestep: 500000
Policy Loss: 0.06441615521907806
Value Loss: 2.226865291595459
Entropy: -3.473778486251831
KL Divergence: 17.435619354248047
episode: [2807. 2824. 2832. 2865.]; total steps: 500000; episodes scores: [-5

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2820.0.mp4
episode rendered
Episode 1/1 - Score: [311.48323205]
episode: [2822. 2836. 2844. 2880.]; total steps: 507000; episodes scores: [ -77.08248189    3.02486112   63.00647324 -105.96035115]; avg score: -29.252874668604317
learning timestep: 508000
Policy Loss: 0.07037211209535599
Value Loss: 2.927267551422119
Entropy: -3.4710166454315186
KL Divergence: 15.163650512695312
episode: [2823. 2837. 2845. 2881.]; total steps: 508000; episodes scores: [311.88697108  83.07363812  18.81411579   3.04625231]; avg score: 104.20524432434819
episode: [2824. 2840. 2846. 2882.]; total steps: 509000; episodes scores: [152.46072615 -78.21385161 163.29634082 103.87089976]; avg score: 85.3535287799707
learning timestep: 510000
Policy Loss: -0.020328959450125694
Value Loss: 2.225741147994995
Entropy: -3.5112524032592773
KL Divergence: 17.122455596923828
episode: [2827. 2841. 2847. 2883.]; total steps: 510000; episodes score

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2840.0.mp4
episode rendered
Episode 1/1 - Score: [-59.3938479]
learning timestep: 516000
Policy Loss: -0.014265727251768112
Value Loss: 2.758664608001709
Entropy: -3.5924315452575684
KL Divergence: 15.167126655578613
episode: [2840. 2855. 2860. 2895.]; total steps: 516000; episodes scores: [-77.39376431 -71.61803482 -76.63502942  24.89503952]; avg score: -50.187947260969
episode: [2842. 2858. 2861. 2897.]; total steps: 517000; episodes scores: [-14.45433199 -10.68256704 309.76384457  -2.61205927]; avg score: 70.50372156434406
learning timestep: 518000
Policy Loss: -0.014389793388545513
Value Loss: 1.110002040863037
Entropy: -3.538435935974121
KL Divergence: 16.79971694946289
episode: [2843. 2859. 2862. 2899.]; total steps: 518000; episodes scores: [159.95699827 154.99059121 145.12640893 -79.661916  ]; avg score: 95.10302060178184
episode: [2844. 2861. 2863. 2900.]; total steps: 519000; episodes scores: [148.

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2860.0.mp4
episode rendered
Episode 1/1 - Score: [312.4243571]
episode: [2860. 2870. 2879. 2920.]; total steps: 527000; episodes scores: [-66.13053534 127.70822214  55.05582521  46.14032357]; avg score: 40.69345889397357
learning timestep: 528000
Policy Loss: 0.11023762077093124
Value Loss: 2.14155912399292
Entropy: -3.405825614929199
KL Divergence: 17.392244338989258
episode: [2862. 2873. 2881. 2922.]; total steps: 528000; episodes scores: [-51.91042161  31.08976983  93.89852742   4.44115189]; avg score: 19.379756881310964
episode: [2862. 2874. 2881. 2924.]; total steps: 529000; episodes scores: [-51.91042161  98.01544685  93.89852742 -19.24725735]; avg score: 30.18907382519246
learning timestep: 530000
Policy Loss: 0.2477133423089981
Value Loss: 4.403156757354736
Entropy: -3.6704721450805664
KL Divergence: 17.653966903686523
episode: [2865. 2875. 2882. 2926.]; total steps: 530000; episodes scores: [-67.423

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2880.0.mp4
episode rendered
Episode 1/1 - Score: [43.86306557]
episode: [2881. 2890. 2898. 2941.]; total steps: 539000; episodes scores: [-41.31458263 -58.74850848 -17.71370169  92.77961183]; avg score: -6.24929524447041
learning timestep: 540000
Policy Loss: -0.05602848157286644
Value Loss: 2.1424856185913086
Entropy: -3.6149802207946777
KL Divergence: 17.458580017089844
episode: [2885. 2893. 2900. 2943.]; total steps: 540000; episodes scores: [-72.34116613 -20.09611747   7.45631208 -13.21681431]; avg score: -24.54944645975642
episode: [2886. 2894. 2902. 2944.]; total steps: 541000; episodes scores: [ 98.16074286 311.58912827  63.27525111 311.47167439]; avg score: 196.12419915668653
learning timestep: 542000
Policy Loss: -0.08672332018613815
Value Loss: 1.1992177963256836
Entropy: -3.677760124206543
KL Divergence: 15.639127731323242
episode: [2887. 2897. 2903. 2945.]; total steps: 542000; episodes scores: [

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2900.0.mp4
episode rendered
Episode 1/1 - Score: [312.97895009]
learning timestep: 550000
Policy Loss: 0.3448040783405304
Value Loss: 2.5024852752685547
Entropy: -3.5920825004577637
KL Divergence: 17.613765716552734
episode: [2900. 2912. 2919. 2962.]; total steps: 550000; episodes scores: [145.55557999 -11.54395009  96.42475943 -16.4849382 ]; avg score: 53.48786278545875
episode: [2900. 2913. 2920. 2963.]; total steps: 551000; episodes scores: [145.55557999  19.17906989 121.13362214 312.4729833 ]; avg score: 149.58531383152393
learning timestep: 552000
Policy Loss: -0.005189662799239159
Value Loss: 1.9822516441345215
Entropy: -3.671875
KL Divergence: 16.381500244140625
episode: [2903. 2914. 2922. 2965.]; total steps: 552000; episodes scores: [  15.25944614  313.71951629   92.30555162 -106.27639508]; avg score: 78.75202974060801
episode: [2905. 2916. 2923. 2966.]; total steps: 553000; episodes scores: [-105.2

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2920.0.mp4
episode rendered
Episode 1/1 - Score: [311.93050987]
episode: [2920. 2930. 2938. 2980.]; total steps: 563000; episodes scores: [ 311.71768962  311.92856864  311.25684327 -105.59957551]; avg score: 207.32588150171662
learning timestep: 564000
Policy Loss: 0.17750950157642365
Value Loss: 9.655601501464844
Entropy: -3.5806362628936768
KL Divergence: 17.500728607177734
episode: [2921. 2931. 2940. 2981.]; total steps: 564000; episodes scores: [  2.08487006  -9.43692912 -27.42019289 122.9073478 ]; avg score: 22.033773963167228
episode: [2923. 2932. 2941. 2981.]; total steps: 565000; episodes scores: [ 36.40520176 311.22372203 311.82719262 122.9073478 ]; avg score: 195.59086605281237
learning timestep: 566000
Policy Loss: -0.17984892427921295
Value Loss: 1.1462900638580322
Entropy: -3.6836369037628174
KL Divergence: 17.442541122436523
episode: [2924. 2934. 2942. 2983.]; total steps: 566000; episodes scor

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2940.0.mp4
episode rendered
Episode 1/1 - Score: [314.11643427]
learning timestep: 580000
Policy Loss: -0.0010248234029859304
Value Loss: 1.029537320137024
Entropy: -3.815364122390747
KL Divergence: 18.974611282348633
episode: [2940. 2948. 2959. 2998.]; total steps: 580000; episodes scores: [  8.05163869  76.78156044 -40.96197847 126.83319204]; avg score: 42.67610317776502
episode: [2941. 2950. 2960. 2998.]; total steps: 581000; episodes scores: [177.70144559  79.64202144   8.17306695 126.83319204]; avg score: 98.08743150737021
learning timestep: 582000
Policy Loss: -0.050936076790094376
Value Loss: 0.862544059753418
Entropy: -3.6508328914642334
KL Divergence: 14.994202613830566
episode: [2942. 2951. 2961. 2999.]; total steps: 582000; episodes scores: [312.24762951 159.56897703 310.65341816 313.65452242]; avg score: 274.03113677936966
episode: [2943. 2952. 2962. 3001.]; total steps: 583000; episodes scores: 

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2960.0.mp4
episode rendered
Episode 1/1 - Score: [43.52451311]
episode: [2960. 2965. 2972. 3013.]; total steps: 595000; episodes scores: [116.21402973 313.17882769 312.31630181 311.27964934]; avg score: 263.24720214333587
learning timestep: 596000
Policy Loss: -0.18701232969760895
Value Loss: 2.252796173095703
Entropy: -3.730372190475464
KL Divergence: 15.468265533447266
episode: [2960. 2966. 2973. 3014.]; total steps: 596000; episodes scores: [116.21402973 310.93125488 312.10243747 139.72101848]; avg score: 219.74218513762747
episode: [2961. 2966. 2974. 3015.]; total steps: 597000; episodes scores: [311.15013917 310.93125488  63.95479995 311.41236815]; avg score: 249.36214053964085
learning timestep: 598000
Policy Loss: 0.00048639788292348385
Value Loss: 1.299876093864441
Entropy: -3.588715076446533
KL Divergence: 14.677412986755371
episode: [2963. 2968. 2975. 3018.]; total steps: 598000; episodes scores: [

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_2980.0.mp4
episode rendered
Episode 1/1 - Score: [311.57973316]
learning timestep: 610000
Policy Loss: 0.04191167652606964
Value Loss: 1.2515928745269775
Entropy: -3.8003554344177246
KL Divergence: 16.310457229614258
episode: [2981. 2981. 2990. 3036.]; total steps: 610000; episodes scores: [-21.08405906  67.3848857   -8.46747603  28.38555963]; avg score: 16.55472756003146
episode: [2982. 2982. 2992. 3037.]; total steps: 611000; episodes scores: [312.21728442 313.14755229  31.04379454 312.3429361 ]; avg score: 242.18789183705994
learning timestep: 612000
Policy Loss: 0.03723413869738579
Value Loss: 8.570172309875488
Entropy: -3.5899219512939453
KL Divergence: 14.360932350158691
episode: [2984. 2984. 2994. 3039.]; total steps: 612000; episodes scores: [-82.80596034   5.19079032 -45.67949308 -28.72033963]; avg score: -38.00375068170905
episode: [2985. 2984. 2995. 3040.]; total steps: 613000; episodes scores: [ 

                                                                

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3000.0.mp4
episode rendered
Episode 1/1 - Score: [172.0831803]
learning timestep: 624000
Policy Loss: -0.20259475708007812
Value Loss: 0.9130328893661499
Entropy: -3.847393274307251
KL Divergence: 16.727455139160156
episode: [3000. 3000. 3009. 3055.]; total steps: 624000; episodes scores: [128.38494508  83.19263215  45.31562922   0.93520335]; avg score: 64.45710245259576
episode: [3003. 3001. 3011. 3056.]; total steps: 625000; episodes scores: [-21.0369755  313.68434402  63.32137967 314.2226411 ]; avg score: 167.54784732349032
learning timestep: 626000
Policy Loss: 0.04005631431937218
Value Loss: 1.1598228216171265
Entropy: -3.724675178527832
KL Divergence: 16.229778289794922
episode: [3004. 3001. 3014. 3057.]; total steps: 626000; episodes scores: [ 31.45503061 313.68434402 -49.19679822 210.43997998]; avg score: 126.59563910081077
episode: [3007. 3002. 3014. 3058.]; total steps: 627000; episodes scores: [-4

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3020.0.mp4
episode rendered
Episode 1/1 - Score: [311.76446133]
episode: [3021. 3014. 3028. 3075.]; total steps: 639000; episodes scores: [-109.21607859    7.21853603  311.91475517   69.05452705]; avg score: 69.74293491454598
learning timestep: 640000
Policy Loss: -0.06153826788067818
Value Loss: 2.6806297302246094
Entropy: -3.8244895935058594
KL Divergence: 16.812273025512695
episode: [3022. 3015. 3029. 3075.]; total steps: 640000; episodes scores: [105.06499138 310.90930026 117.34593483  69.05452705]; avg score: 150.59368837986938
episode: [3023. 3017. 3030. 3076.]; total steps: 641000; episodes scores: [312.67187921  42.38858577 312.49586406 312.49020358]; avg score: 245.01163315613883
learning timestep: 642000
Policy Loss: -0.10221210867166519
Value Loss: 2.9410884380340576
Entropy: -3.5772929191589355
KL Divergence: 16.036794662475586
episode: [3024. 3018. 3030. 3078.]; total steps: 642000; episodes sco

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3040.0.mp4
episode rendered
Episode 1/1 - Score: [3.24020006]
episode: [3040. 3036. 3045. 3095.]; total steps: 657000; episodes scores: [ 89.42275384 313.0920013  124.23599288  -0.50104492]; avg score: 131.56242577393843
learning timestep: 658000
Policy Loss: -0.18561510741710663
Value Loss: 0.37514254450798035
Entropy: -3.854501485824585
KL Divergence: 17.066360473632812
episode: [3040. 3038. 3046. 3095.]; total steps: 658000; episodes scores: [ 89.42275384  16.06197519 312.44352464  -0.50104492]; avg score: 104.35680218700739
episode: [3043. 3040. 3048. 3097.]; total steps: 659000; episodes scores: [-18.03189367  12.08059025  50.72108503  93.30938094]; avg score: 34.51979063696787
learning timestep: 660000
Policy Loss: -0.06252356618642807
Value Loss: 0.8693183660507202
Entropy: -3.826188564300537
KL Divergence: 16.741703033447266
episode: [3044. 3041. 3048. 3098.]; total steps: 660000; episodes scores: [1

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3060.0.mp4
episode rendered
Episode 1/1 - Score: [312.85985278]
episode: [3060. 3055. 3064. 3111.]; total steps: 673000; episodes scores: [312.81241478 -37.28114043 173.22501835 312.40090449]; avg score: 190.28929929740048
learning timestep: 674000
Policy Loss: -0.015437867492437363
Value Loss: 1.4573725461959839
Entropy: -3.7682225704193115
KL Divergence: 20.28260040283203
episode: [3061. 3057. 3066. 3112.]; total steps: 674000; episodes scores: [143.61487679  27.84775065  28.3399181    5.57518063]; avg score: 51.34443154162573
episode: [3063. 3059. 3067. 3116.]; total steps: 675000; episodes scores: [-76.53112699  67.76320271 168.40919735 -47.49878918]; avg score: 28.035620973306337
learning timestep: 676000
Policy Loss: 0.09616871178150177
Value Loss: 1.6018245220184326
Entropy: -3.7444353103637695
KL Divergence: 17.575124740600586
episode: [3064. 3061. 3069. 3117.]; total steps: 676000; episodes scores: 

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3080.0.mp4
episode rendered
Episode 1/1 - Score: [314.47372602]
episode: [3080. 3080. 3087. 3134.]; total steps: 689000; episodes scores: [-20.45406029 -71.3866836   81.51554207 142.22480507]; avg score: 32.97490081215641
learning timestep: 690000
Policy Loss: -0.05707457289099693
Value Loss: 0.9650827646255493
Entropy: -3.9221482276916504
KL Divergence: 15.804854393005371
episode: [3082. 3081. 3089. 3134.]; total steps: 690000; episodes scores: [-51.0040297  313.56178554 -13.54359047 142.22480507]; avg score: 97.8097426094418
episode: [3083. 3082. 3091. 3135.]; total steps: 691000; episodes scores: [132.74120099  -0.84225489   9.61971939 313.94344942]; avg score: 113.86552872933134
learning timestep: 692000
Policy Loss: 0.04648706316947937
Value Loss: 4.121382713317871
Entropy: -3.749843120574951
KL Divergence: 15.989362716674805
episode: [3084. 3084. 3092. 3137.]; total steps: 692000; episodes scores: [313

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3100.0.mp4
episode rendered
Episode 1/1 - Score: [-1.74782058]
learning timestep: 704000
Policy Loss: -0.05560417100787163
Value Loss: 1.430938482284546
Entropy: -3.7381935119628906
KL Divergence: 14.741450309753418
episode: [3101. 3101. 3104. 3155.]; total steps: 704000; episodes scores: [-46.325198    34.63669336  11.62298038 313.71097962]; avg score: 78.4113638410758
episode: [3102. 3102. 3105. 3157.]; total steps: 705000; episodes scores: [  2.27427015 314.97191454 155.1369292  -17.31032201]; avg score: 113.7681979730987
learning timestep: 706000
Policy Loss: -0.052751194685697556
Value Loss: 1.532775640487671
Entropy: -3.992861747741699
KL Divergence: 20.009145736694336
episode: [3103. 3103. 3107. 3158.]; total steps: 706000; episodes scores: [314.08022178  71.32930986  90.6859676   76.77362119]; avg score: 138.21728010863148
episode: [3104. 3104. 3108. 3159.]; total steps: 707000; episodes scores: [315

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3120.0.mp4
episode rendered
Episode 1/1 - Score: [51.97081874]
learning timestep: 718000
Policy Loss: -0.050487618893384933
Value Loss: 0.7386610507965088
Entropy: -3.794233798980713
KL Divergence: 23.288789749145508
episode: [3122. 3121. 3130. 3174.]; total steps: 718000; episodes scores: [-57.76558158 -12.87991497  96.7021013  316.32362199]; avg score: 85.59505668411143
episode: [3123. 3123. 3132. 3176.]; total steps: 719000; episodes scores: [315.87081254  37.84693281   1.61140717 -23.54383976]; avg score: 82.94632819068
learning timestep: 720000
Policy Loss: 0.11954686790704727
Value Loss: 2.6645169258117676
Entropy: -3.930811882019043
KL Divergence: 22.247692108154297
episode: [3124. 3124. 3134. 3178.]; total steps: 720000; episodes scores: [ 79.12973626 131.87824197  97.66475726 -76.46157307]; avg score: 58.05279060203411
episode: [3127. 3126. 3135. 3180.]; total steps: 721000; episodes scores: [-50.64

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3140.0.mp4
episode rendered
Episode 1/1 - Score: [2.24117061]
episode: [3141. 3141. 3152. 3198.]; total steps: 733000; episodes scores: [ 29.17447698 314.13389596 171.19177192 146.08947453]; avg score: 165.14740484652583
learning timestep: 734000
Policy Loss: 0.003293613437563181
Value Loss: 7.600711822509766
Entropy: -3.713885545730591
KL Divergence: 13.961363792419434
episode: [3141. 3142. 3154. 3200.]; total steps: 734000; episodes scores: [ 29.17447698 141.81936696   0.43663306  -9.5136114 ]; avg score: 40.47921640026995
episode: [3142. 3143. 3154. 3200.]; total steps: 735000; episodes scores: [316.56671256 315.8043376    0.43663306  -9.5136114 ]; avg score: 155.82351795527407
learning timestep: 736000
Policy Loss: 0.012361516244709492
Value Loss: 2.4696950912475586
Entropy: -3.6868736743927
KL Divergence: 15.213072776794434
episode: [3144. 3146. 3155. 3202.]; total steps: 736000; episodes scores: [ 83.4

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3160.0.mp4
episode rendered
Episode 1/1 - Score: [313.51099639]
learning timestep: 752000
Policy Loss: 0.2566719055175781
Value Loss: 9.780878067016602
Entropy: -3.8866355419158936
KL Divergence: 18.85742950439453
episode: [3160. 3166. 3175. 3220.]; total steps: 752000; episodes scores: [314.39396295 313.55110412 315.19953404  65.99545125]; avg score: 252.28501308801657
episode: [3161. 3168. 3176. 3221.]; total steps: 753000; episodes scores: [193.14759569 -13.4634623  313.72721144 314.01796425]; avg score: 201.85732726914557
learning timestep: 754000
Policy Loss: 0.13352692127227783
Value Loss: 9.783157348632812
Entropy: -3.6932055950164795
KL Divergence: 18.10167121887207
episode: [3162. 3169. 3178. 3222.]; total steps: 754000; episodes scores: [313.66641808 184.92070301  -1.05626395  13.55436925]; avg score: 127.77130659861403
episode: [3163. 3172. 3179. 3223.]; total steps: 755000; episodes scores: [314.

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3180.0.mp4
episode rendered
Episode 1/1 - Score: [314.95474505]
episode: [3180. 3186. 3192. 3240.]; total steps: 765000; episodes scores: [171.07830998  37.33217953 -37.37563807  -0.65157389]; avg score: 42.59581938801093
learning timestep: 766000
Policy Loss: -0.16776491701602936
Value Loss: 1.6904165744781494
Entropy: -3.825624465942383
KL Divergence: 22.691444396972656
episode: [3180. 3187. 3193. 3241.]; total steps: 766000; episodes scores: [171.07830998  81.82787934 315.32116002 316.69697152]; avg score: 221.23108021198993
episode: [3181. 3188. 3195. 3242.]; total steps: 767000; episodes scores: [313.47574933 186.39595495 -48.68406161 117.50809759]; avg score: 142.17393506405293
learning timestep: 768000
Policy Loss: -0.09239150583744049
Value Loss: 1.0744431018829346
Entropy: -3.8352432250976562
KL Divergence: 18.380895614624023
episode: [3182. 3189. 3196. 3243.]; total steps: 768000; episodes scores: 

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3200.0.mp4
episode rendered
Episode 1/1 - Score: [102.19546821]
learning timestep: 784000
Policy Loss: -0.06745795160531998
Value Loss: 3.998114585876465
Entropy: -3.8775720596313477
KL Divergence: 20.49707794189453
episode: [3201. 3209. 3219. 3264.]; total steps: 784000; episodes scores: [ 14.20227151 315.56314759  43.89343935 315.44708147]; avg score: 172.27648497929084
episode: [3202. 3210. 3220. 3265.]; total steps: 785000; episodes scores: [314.84404458 315.0023817   76.17338041 315.00368696]; avg score: 255.25587341203698
learning timestep: 786000
Policy Loss: 0.08601474016904831
Value Loss: 7.773627281188965
Entropy: -3.871039628982544
KL Divergence: 18.575162887573242
episode: [3203. 3211. 3221. 3266.]; total steps: 786000; episodes scores: [125.47400009  39.93193402 314.23202707 316.36992776]; avg score: 199.00197223325875
episode: [3207. 3212. 3222. 3267.]; total steps: 787000; episodes scores: [-2

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3220.0.mp4
episode rendered
Episode 1/1 - Score: [317.16466773]
episode: [3220. 3225. 3235. 3280.]; total steps: 797000; episodes scores: [  98.16807411  189.77130799  133.85372369 -109.37135027]; avg score: 78.10543888121369
learning timestep: 798000
Policy Loss: 0.22042085230350494
Value Loss: 7.09515380859375
Entropy: -3.9143600463867188
KL Divergence: 19.1748104095459
episode: [3222. 3226. 3236. 3280.]; total steps: 798000; episodes scores: [ -20.8158326   316.62100785   41.3056506  -109.37135027]; avg score: 56.934868895134244
episode: [3223. 3227. 3238. 3281.]; total steps: 799000; episodes scores: [316.41157135 316.09234473  27.89588257 315.21925773]; avg score: 243.90476409477247
learning timestep: 800000
Policy Loss: -0.10095246136188507
Value Loss: 3.510944128036499
Entropy: -3.7940783500671387
KL Divergence: 13.184600830078125
episode: [3225. 3229. 3239. 3282.]; total steps: 800000; episodes score

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3240.0.mp4
episode rendered
Episode 1/1 - Score: [313.75932665]
episode: [3240. 3243. 3255. 3298.]; total steps: 813000; episodes scores: [315.61507336 162.98975276 315.90252399 314.62377906]; avg score: 277.2827822902907
learning timestep: 814000
Policy Loss: 0.15310369431972504
Value Loss: 8.212224960327148
Entropy: -3.8587310314178467
KL Divergence: 20.203052520751953
episode: [3241. 3243. 3255. 3300.]; total steps: 814000; episodes scores: [ 27.77126264 162.98975276 315.90252399 -34.00049386]; avg score: 118.16576137986914
episode: [3242. 3244. 3256. 3301.]; total steps: 815000; episodes scores: [314.36655909 314.91032452 314.81988178 315.37905801]; avg score: 314.86895584840215
learning timestep: 816000
Policy Loss: -0.12478954344987869
Value Loss: 1.0597491264343262
Entropy: -3.6003808975219727
KL Divergence: 16.35373878479004
episode: [3243. 3245. 3257. 3302.]; total steps: 816000; episodes scores: [3

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3260.0.mp4
episode rendered
Episode 1/1 - Score: [314.80308717]
learning timestep: 832000
Policy Loss: -0.09838683903217316
Value Loss: 0.803752601146698
Entropy: -4.046159744262695
KL Divergence: 19.57610511779785
episode: [3260. 3264. 3273. 3320.]; total steps: 832000; episodes scores: [315.44498766 314.85753079 202.67279039 316.0305895 ]; avg score: 287.2514745866919
episode: [3261. 3265. 3274. 3321.]; total steps: 833000; episodes scores: [172.31674199 313.75776302 314.51136856 314.08009768]; avg score: 278.66649281469466
learning timestep: 834000
Policy Loss: -0.12138630449771881
Value Loss: 0.38279685378074646
Entropy: -3.951089382171631
KL Divergence: 20.82535743713379
episode: [3264. 3266. 3276. 3323.]; total steps: 834000; episodes scores: [-75.50907041 178.43488889 -40.29913721  48.80601123]; avg score: 27.858173125986667
episode: [3265. 3266. 3277. 3323.]; total steps: 835000; episodes scores: [31

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3280.0.mp4
episode rendered
Episode 1/1 - Score: [315.88779047]
learning timestep: 846000
Policy Loss: -0.08296018093824387
Value Loss: 1.607154130935669
Entropy: -3.7294375896453857
KL Divergence: 15.114433288574219
episode: [3280. 3280. 3287. 3335.]; total steps: 846000; episodes scores: [ 13.0474368  107.22245982 316.33359166 316.14191414]; avg score: 188.1863506057817
episode: [3281. 3282. 3288. 3338.]; total steps: 847000; episodes scores: [315.22936524 105.04025706 316.58928817 -13.45674953]; avg score: 180.8505402339406
learning timestep: 848000
Policy Loss: -0.051155541092157364
Value Loss: 2.0659196376800537
Entropy: -4.014094352722168
KL Divergence: 17.09247589111328
episode: [3282. 3284. 3289. 3339.]; total steps: 848000; episodes scores: [ 80.2795621   -9.27627008 122.61105846 164.6701755 ]; avg score: 89.57113149689285
episode: [3283. 3285. 3290. 3340.]; total steps: 849000; episodes scores: [31

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3300.0.mp4
episode rendered
Episode 1/1 - Score: [209.32849262]
episode: [3302. 3302. 3308. 3356.]; total steps: 863000; episodes scores: [-112.35647812  115.5926628   315.18845615   -6.68548582]; avg score: 77.93478875339719
learning timestep: 864000
Policy Loss: -0.1357293576002121
Value Loss: 0.5657939314842224
Entropy: -3.6806278228759766
KL Divergence: 14.042125701904297
episode: [3303. 3305. 3309. 3357.]; total steps: 864000; episodes scores: [315.51447016 -42.28189113 188.01127062 315.65746011]; avg score: 194.22532743942116
episode: [3304. 3305. 3310. 3358.]; total steps: 865000; episodes scores: [314.23499478 -42.28189113 315.40811843 317.44431734]; avg score: 226.2013848553931
learning timestep: 866000
Policy Loss: -0.1091180071234703
Value Loss: 0.36060142517089844
Entropy: -3.974686622619629
KL Divergence: 18.633848190307617
episode: [3305. 3308. 3311. 3359.]; total steps: 866000; episodes scores

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3320.0.mp4
episode rendered
Episode 1/1 - Score: [316.05488639]
episode: [3320. 3322. 3330. 3373.]; total steps: 881000; episodes scores: [315.36193265 186.58002447 315.69663848 316.95085677]; avg score: 283.64736309291425
learning timestep: 882000
Policy Loss: -0.01297600008547306
Value Loss: 7.447146415710449
Entropy: -3.575497627258301
KL Divergence: 18.161828994750977
episode: [3321. 3323. 3331. 3375.]; total steps: 882000; episodes scores: [126.81497042 317.96986744 316.02413894 -51.70857114]; avg score: 177.2751014152225
episode: [3323. 3325. 3333. 3376.]; total steps: 883000; episodes scores: [ 46.23082142 -44.64763563 -51.40594799 315.51279159]; avg score: 66.42250734696492
learning timestep: 884000
Policy Loss: 0.07409293204545975
Value Loss: 1.2975196838378906
Entropy: -4.094635963439941
KL Divergence: 19.910797119140625
episode: [3325. 3328. 3334. 3376.]; total steps: 884000; episodes scores: [  9

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3340.0.mp4
episode rendered
Episode 1/1 - Score: [-25.30076916]
episode: [3341. 3342. 3348. 3395.]; total steps: 895000; episodes scores: [ 46.46932494 205.45708742 155.9945589  162.94397296]; avg score: 142.71623605450586
learning timestep: 896000
Policy Loss: -0.059974756091833115
Value Loss: 1.985439419746399
Entropy: -3.582726001739502
KL Divergence: 18.77236557006836
episode: [3342. 3344. 3350. 3396.]; total steps: 896000; episodes scores: [315.87562217 -20.68573151  14.1197552  315.27228362]; avg score: 156.14548236915076
episode: [3342. 3345. 3351. 3397.]; total steps: 897000; episodes scores: [315.87562217 316.78292302 173.44648943 149.31922303]; avg score: 238.8560644116557
learning timestep: 898000
Policy Loss: -0.10807856917381287
Value Loss: 1.7477765083312988
Entropy: -3.784639358520508
KL Divergence: 21.261131286621094
episode: [3345. 3346. 3352. 3399.]; total steps: 898000; episodes scores: [-

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3360.0.mp4
episode rendered
Episode 1/1 - Score: [315.80973522]
learning timestep: 910000
Policy Loss: -0.018132606521248817
Value Loss: 0.7848454713821411
Entropy: -3.918271541595459
KL Divergence: 15.33383560180664
episode: [3360. 3362. 3367. 3415.]; total steps: 910000; episodes scores: [ 80.0584432  -79.9060917   31.05810377  96.04182875]; avg score: 31.813071004714832
episode: [3362. 3363. 3368. 3416.]; total steps: 911000; episodes scores: [ 71.90851265 313.59603951  53.80898886 315.46220929]; avg score: 188.69393757917575
learning timestep: 912000
Policy Loss: 0.05033503845334053
Value Loss: 3.610290050506592
Entropy: -3.619378089904785
KL Divergence: 15.034618377685547
episode: [3362. 3364. 3369. 3417.]; total steps: 912000; episodes scores: [ 71.90851265 314.6282142  315.67740681 316.87793223]; avg score: 254.77301647518595
episode: [3363. 3365. 3370. 3420.]; total steps: 913000; episodes scores: [3

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3380.0.mp4
episode rendered
Episode 1/1 - Score: [314.93765283]
episode: [3380. 3384. 3387. 3438.]; total steps: 929000; episodes scores: [ 87.50717285 152.98746075 -47.46608924  61.34447865]; avg score: 63.593255750082974
learning timestep: 930000
Policy Loss: 0.09568861871957779
Value Loss: 8.465245246887207
Entropy: -3.7158150672912598
KL Divergence: 16.94952964782715
episode: [3381. 3386. 3388. 3439.]; total steps: 930000; episodes scores: [-13.25285215 142.34562335 114.1906342  314.99011747]; avg score: 139.5683807161621
episode: [3382. 3387. 3389. 3440.]; total steps: 931000; episodes scores: [314.78091788 193.02194152 315.55608555 314.97528298]; avg score: 284.58355698112416
learning timestep: 932000
Policy Loss: 0.10725779086351395
Value Loss: 6.036057949066162
Entropy: -3.7694549560546875
KL Divergence: 15.070615768432617
episode: [3383. 3387. 3390. 3441.]; total steps: 932000; episodes scores: [144

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3400.0.mp4
episode rendered
Episode 1/1 - Score: [315.49335962]
episode: [3400. 3407. 3407. 3458.]; total steps: 947000; episodes scores: [316.72524026 143.1263531   29.27451047 192.94567492]; avg score: 170.51794468614787
learning timestep: 948000
Policy Loss: -0.07818951457738876
Value Loss: 0.5850096940994263
Entropy: -3.8072075843811035
KL Divergence: 19.95567512512207
episode: [3401. 3408. 3409. 3460.]; total steps: 948000; episodes scores: [313.93339123 316.59829233 117.98504565 -53.0659114 ]; avg score: 173.86270445112635
episode: [3402. 3409. 3409. 3461.]; total steps: 949000; episodes scores: [  4.94323629 315.33594349 117.98504565 315.55622047]; avg score: 188.45511147741794
learning timestep: 950000
Policy Loss: -0.003778832731768489
Value Loss: 1.9230012893676758
Entropy: -3.847870111465454
KL Divergence: 22.7615909576416
episode: [3403. 3410. 3410. 3462.]; total steps: 950000; episodes scores: [

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3420.0.mp4
episode rendered
Episode 1/1 - Score: [314.89965085]
learning timestep: 964000
Policy Loss: 0.08691410720348358
Value Loss: 3.349274158477783
Entropy: -4.062839031219482
KL Divergence: 20.035083770751953
episode: [3421. 3431. 3429. 3482.]; total steps: 964000; episodes scores: [-54.99852448 -33.48058432 -29.77327024  97.713798  ]; avg score: -5.134645262367375
episode: [3422. 3432. 3431. 3483.]; total steps: 965000; episodes scores: [ 87.81923426 315.58086994 -31.52869425 315.71488572]; avg score: 171.89657391613852
learning timestep: 966000
Policy Loss: 0.014023419469594955
Value Loss: 2.3408071994781494
Entropy: -3.7772445678710938
KL Divergence: 19.579517364501953
episode: [3424. 3433. 3432. 3483.]; total steps: 966000; episodes scores: [-42.26073575 315.38895704 156.18209139 315.71488572]; avg score: 186.25629960151196
episode: [3425. 3433. 3433. 3486.]; total steps: 967000; episodes scores: [

                                                                 

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3440.0.mp4
episode rendered
Episode 1/1 - Score: [317.74082209]
episode: [3440. 3447. 3449. 3501.]; total steps: 979000; episodes scores: [317.90687816  18.3218594  -34.92299195 122.59574715]; avg score: 105.97537318938814
learning timestep: 980000
Policy Loss: 0.038586027920246124
Value Loss: 3.8156585693359375
Entropy: -3.9633066654205322
KL Divergence: 20.802865982055664
episode: [3441. 3449. 3450. 3503.]; total steps: 980000; episodes scores: [197.08912431  -6.08862445 137.47727282  97.49558754]; avg score: 106.4933400533734
episode: [3442. 3450. 3451. 3503.]; total steps: 981000; episodes scores: [ 62.7209011  317.81521993 316.63948781  97.49558754]; avg score: 198.66779909335654
learning timestep: 982000
Policy Loss: 0.1340305060148239
Value Loss: 1.9921256303787231
Entropy: -3.9898931980133057
KL Divergence: 22.447311401367188
episode: [3443. 3452. 3452. 3505.]; total steps: 982000; episodes scores: [

                                                               

Moviepy - Done !
Moviepy - video ready BipedalWalker/ppo/renders/train/episode_3460.0.mp4
episode rendered
Episode 1/1 - Score: [182.42518424]
learning timestep: 994000
Policy Loss: 0.10131906718015671
Value Loss: 3.121669292449951
Entropy: -3.8008174896240234
KL Divergence: 22.508853912353516
episode: [3461. 3470. 3468. 3527.]; total steps: 994000; episodes scores: [  75.62734458  316.29980166  202.74966529 -104.46258778]; avg score: 122.55355594026153
episode: [3462. 3471. 3469. 3529.]; total steps: 995000; episodes scores: [  85.47222923  316.37270242   76.05232748 -105.44546949]; avg score: 93.11294740960201
learning timestep: 996000
Policy Loss: -0.004829706624150276
Value Loss: 3.143052101135254
Entropy: -3.8219408988952637
KL Divergence: 17.429058074951172
episode: [3463. 3472. 3472. 3530.]; total steps: 996000; episodes scores: [315.80129728 -34.08754096  -4.74484256 316.22482518]; avg score: 148.2984347353988
episode: [3464. 3474. 3473. 3532.]; total steps: 997000; episodes sc

0,1
action_0,██▆▃▁▄▄▁▅▄▄▁▆▃▃▅▄▅▂▆▃▅▆▃█▁▄▅▄▇▆▄▇▆▆▅▂▇▆█
action_1,▁▅▂▇▅▅▆▃▆▆█▆▄█▅█▅▅▅▇▇▅▅▃█▆▆▅▅▅▃▃▄██▇▃▃▄█
action_2,▅▃▄▇▃▄▆▄▁▅▅▇▁█▂▁▄▄▃▃▅█▆▇▆▆▆▆▆█▅▅▅▆▇▇▅▇▅█
action_3,▅▇▅█▂▂▆▂▁▃▆█▁█▃▇▄▃█▅▁▇▂▁▅▂▂▅▃▃▅██▂▃▂██▄▅
actor_loss,▄▄▅▃▇▆▅▂▄▄█▄▅▆▅▅▅▁▁▄▂▅▂▆▅▆▅▃▄▅▅▅▄▄▃▃▃▆▅▄
advantages,█▁▃▄▂▆█▄▅▄▄▂▂▅▅▅▄▅▄▃▅▃▅▄▄▂▃▅▅▅▅▅▆▅▅▃▅▄▃▆
avg_env_scores,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▃▂▅▅▂▂▃▄▃▆▄▄█▆▆█▃▆▆▅
critic_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▄▂▁▂▂▄▂▂▁▁▂▁█▇▂
entropy,██▅▅▆▄▅▄▄▄▃▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁
episode,▁▁▂▃▃▄▅▅▅▅▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██████

0,1
action_0,-0.94963
action_1,0.19428
action_2,-0.39075
action_3,0.54067
actor_loss,-0.05133
advantages,0.0
avg_env_scores,201.81107
best,False
critic_loss,1.78703
entropy,-3.76081


In [None]:
# Display the `spec` attribute of the environment held by the `ppo` agent.
ppo.env.spec

In [None]:
# Build an environment container through the agent's private `_initialize_env`
# helper; the result exposes an `.envs` collection (iterated in the next cell).
# NOTE(review): the positional arguments (100, 2, 42) are not documented here —
# presumably a render frequency, an environment count and a seed; confirm
# against the definition of PPO._initialize_env.
env_vec = ppo._initialize_env(100, 2, 42)

In [None]:
# Print the `spec` of each item in `env_vec.envs`, one per line.
# Note: the loop variable `env` is a notebook global and stays bound to the
# last item iterated once this cell finishes.
for env in env_vec.envs:
    print(env.spec)

In [None]:
# Create the environment registered under this id with `gym.make`.
# This rebinds the global `env`, replacing whatever an earlier cell left there.
env_id = 'ALE/SpaceInvaders-ram-v5'
env = gym.make(env_id)

In [None]:
# Display the `n` attribute of the environment's action space.
# NOTE(review): presumably a Discrete space, where `n` is the number of
# available actions — confirm for this environment id.
env.action_space.n

In [None]:
# Load the saved configuration for the walker2d PPO run from disk.
# NOTE(review): this is an absolute, machine-specific path — adjust it if the
# repository is checked out somewhere else.
config_file_path = '/workspaces/RL_Agents/src/app/walker2d/ppo/config.json'
# JSON text is UTF-8; pass the encoding explicitly so the read does not depend
# on the platform's default locale encoding.
with open(config_file_path, 'r', encoding='utf-8') as file:
    config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(config)

In [None]:
# Build the `walker` agent by passing the loaded config dict to `PPO.load`.
walker = PPO.load(config)

In [None]:
# Display the environment attached to the agent loaded above. The cell used to
# reference `humanoid`, a name not bound by any nearby cell; the agent loaded
# and tested in the surrounding cells is `walker`.
walker.env

In [None]:
# Run the agent's `test` method and keep whatever it returns in `test_data`.
# NOTE(review): the first argument (10) is presumably the number of test
# episodes and `render_freq=1` presumably renders every episode — confirm
# against the definition of PPO.test.
test_data = walker.test(10, render_freq=1)

In [None]:
# Display the value returned by the test run above.
test_data

In [None]:
# Display evenly spaced values starting at 0.001 in steps of 0.005, stopping
# before 0.101.
# NOTE(review): with a non-integer step the number of elements `np.arange`
# returns depends on floating-point rounding; NumPy's documentation recommends
# `np.linspace` when an exact element count matters.
np.arange(0.001, 0.101, 0.005)