In [None]:
import numpy as np
import numba
import umap
import pynndescent

print("NumPy version:", np.__version__)
print("Numba version:", numba.__version__)
print("UMAP version:", umap.__version__)
print("PyNNDescent version:", pynndescent.__version__)


In [1]:
import os
import json
import ale_py

import torch as T
import torch.nn as nn
from torch import optim
import numpy as np
# import pandas as pd
# from umap import UMAP


import torch_utils
from torch import distributions

import gymnasium as gym
import gymnasium_robotics as gym_robo
from gymnasium.vector import VectorEnv, SyncVectorEnv
import models
from models import ValueModel, StochasticContinuousPolicy, ActorModel, StochasticDiscretePolicy
import cnn_models
import rl_agents
from rl_agents import PPO, DDPG, Reinforce, ActorCritic#, TD3 HER
import rl_callbacks
from rl_callbacks import WandbCallback
# from helper import Normalizer
import gym_helper
import wandb_support
import wandb
import gym_helper
import dash_utils

# from mpi4py import MPI

In [None]:
import mujoco

In [None]:
mujoco.MjModel

In [None]:
gym_robo.__version__

In [None]:
def check_cuda():
    """Print whether CUDA is available and, if it is, each GPU's name and total memory in GB."""
    if not T.cuda.is_available():
        print("CUDA is not available.")
        return

    print("CUDA is available.")
    device_count = T.cuda.device_count()
    print(f"Number of GPUs detected: {device_count}")

    bytes_per_gb = 1024 ** 3
    for index in range(device_count):
        name = T.cuda.get_device_name(index)
        # total_memory is reported in bytes; convert to GB for display
        total_gb = T.cuda.get_device_properties(index).total_memory / bytes_per_gb
        print(f"GPU {index}: {name}")
        print(f"Total memory: {total_gb:.2f} GB")

check_cuda()

In [None]:
def get_default_device():
    """Returns the default device for computations, GPU if available, otherwise CPU"""
    # PyTorch is imported as `T` in this notebook; the bare name `torch` is
    # never bound, so the original body raised NameError.
    return T.device('cuda' if T.cuda.is_available() else 'cpu')

device = get_default_device()
print(f"Using device: {device}")

# TEST

In [None]:
gym_robo.register_robotics_envs()

In [None]:
gym.envs.registration.registry

In [None]:
# Do not hard-code API keys in the notebook. With no `key` argument, wandb.login()
# picks up credentials from the WANDB_API_KEY environment variable or a previous
# login, and prompts otherwise. The key that was committed here should be revoked.
wandb.login()

In [None]:
# Roll out random actions in FetchPush for a few episodes, recording a video of
# every episode into 'test/'.
env = gym.make('FetchPush-v2', max_episode_steps=100, render_mode='rgb_array')
# the original trigger `i % 1 == 0` is true for every episode index
env = gym.wrappers.RecordVideo(env, 'test/', episode_trigger=lambda i: True)

episodes = 10

try:
    for episode in range(episodes):
        done = False
        obs, _ = env.reset()
        while not done:
            # bind the step info to `info`, not `dict`: cell variables live in the
            # kernel's global namespace, so `dict` would shadow the builtin for
            # every later cell
            obs, r, term, trunc, info = env.step(env.action_space.sample())
            done = term or trunc
finally:
    # always release the environment (and the video recorder), even if a step fails
    env.close()

In [None]:
# Consistency check on FetchReach: take one random step, then recompute reward,
# truncated and terminated from the goals in the returned observation and compare
# them with the values that step() returned.
env = gym.make("FetchReach-v2")
env.reset()
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())

# The following always has to hold:
assert reward == env.compute_reward(obs["achieved_goal"], obs["desired_goal"], info)
assert truncated == env.compute_truncated(obs["achieved_goal"], obs["desired_goal"], info)
assert terminated == env.compute_terminated(obs["achieved_goal"], obs["desired_goal"], info)

In [None]:
# compute_reward needs the achieved goal, the desired goal and the step info;
# calling it with no arguments raises TypeError. Reuse the values from the
# FetchReach step taken in the previous cell.
env.compute_reward(obs["achieved_goal"], obs["desired_goal"], info)

In [None]:
env = gym.make('FetchPush-v2', render_mode='rgb_array')

In [None]:
if hasattr(env, "distance_threshold"):
    print('true')
else:
    print('false')

In [None]:
if env.get_wrapper_attr("distance_threshold"):
    print('true')

In [None]:
print(dir(env))


# DDPG

In [None]:
env = gym.make('Pendulum-v1')

In [None]:
# build actor

# Two layer specs (400 then 300), each a ("relu", variance-scaling) tuple.
# Presumably the tuple is (units, activation, initializer) — confirm against models.ActorModel.
dense_layers = [
    (
        units,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for units in (400, 300)
]

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, optimizer='Adam',
                          optimizer_params={'weight_decay':0.01}, learning_rate=0.001, normalize_layers=True)

In [None]:
actor

In [None]:
ddpg_agent.actor_model

In [None]:
ddpg_agent.target_actor_model

In [None]:
# build critic

# One 400-entry spec for the critic's `state_layers` argument.
state_layers = [
    (400, "relu", {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}}),
]

# One 300-entry spec for the critic's `merged_layers` argument.
merged_layers = [
    (300, "relu", {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}}),
]


critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers,
                            optimizer='Adam', optimizer_params={'weight_decay':0.01}, learning_rate=0.002, normalize_layers=True)

In [None]:
critic

In [None]:
replay_buffer = helper.ReplayBuffer(env, 100000)
# Fall back to CPU when no GPU is present; a hard-coded 'cuda' breaks this cell
# on CPU-only machines.
noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0,
                       device='cuda' if T.cuda.is_available() else 'cpu')

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.99,
                            tau=0.005,
                            replay_buffer=replay_buffer,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('Pendulum-v1')])

In [None]:
ddpg_agent.critic_model

In [None]:
ddpg_agent.target_critic_model

In [None]:
ddpg_agent.train(100, True, 10)

In [None]:
ddpg_agent.test(10, True, 1)

# Actor Critic

In [None]:
env = gym.make("CartPole-v1")

In [None]:
dense_layers = [
    (128, 'relu', "kaiming normal"),
    (256, 'relu', "kaiming normal"),
    ]



In [None]:
policy_model = models.PolicyModel(env=env, dense_layers=dense_layers, optimizer='Adam', learning_rate=0.001,)

In [None]:
for param in policy_model.parameters():
    print(param)

In [None]:
value_model = models.ValueModel(env, dense_layers=dense_layers, optimizer='Adam', learning_rate=0.001)

In [None]:
value_model

In [None]:
for params in value_model.parameters():
    print(params)

In [None]:
actor_critic = rl_agents.ActorCritic(env,
                                     policy_model,
                                     value_model,
                                     discount=0.99,
                                     policy_trace_decay=0.5,
                                     value_trace_decay=0.5,
                                     callbacks=[rl_callbacks.WandbCallback('CartPole-v1-Actor-Critic')])

In [None]:
actor_critic.train(200)

In [None]:
actor_critic.test(10, True, 1)

# REINFORCE

In [None]:
env = gym.make("CartPole-v1")

In [None]:
dense_layers = [
    (128, 'relu', {
                    "kaiming normal": {
                        "a":1.0,
                        "mode":'fan_in'
                    }
                },
    ),
    # (256, 'relu', {
    #                 "kaiming_normal": {
    #                     "a":0.0,
    #                     "mode":'fan_in'
    #                 }
    #             },
    # )
    ]

In [None]:
dense_layers = [(128, 'relu', "kaiming normal")]

In [None]:
value_model = models.ValueModel(env, dense_layers, 'Adam', 0.001)

In [None]:
for param in value_model.parameters():
    print(param)

In [None]:
policy_model = models.PolicyModel(env, dense_layers, 'Adam', 0.001)

In [None]:
for param in policy_model.parameters():
    print(param)

In [None]:
reinforce = rl_agents.Reinforce(env, policy_model, value_model, 0.99, [rl_callbacks.WandbCallback('CartPole-v0_REINFORCE', chkpt_freq=100)])

In [None]:
reinforce.train(200, True, 50)

In [None]:
reinforce.test(10, True, 1)

# DDPG w/CNN

In [None]:
env = gym.make('CarRacing-v2')

In [None]:
# Layer specs passed to cnn_models.CNN: three unbiased conv layers (kernels 7, 5, 3;
# stride 3; 'valid' padding; 32 output channels), with relu + batchnorm after the
# first two.
cnn_layers = [
    {"conv": {"out_channels": 32, "kernel_size": 7, "stride": 3, "padding": 'valid', "bias": False}},
    {"relu": {}},
    {"batchnorm": {"num_features": 32}},
    {"conv": {"out_channels": 32, "kernel_size": 5, "stride": 3, "padding": 'valid', "bias": False}},
    {"relu": {}},
    {"batchnorm": {"num_features": 32}},
    {"conv": {"out_channels": 32, "kernel_size": 3, "stride": 3, "padding": 'valid', "bias": False}},
]

In [None]:
cnn = cnn_models.CNN(cnn_layers, env)

In [None]:
cnn

In [None]:
# build actor

# Three identical 64-entry specs, each with "relu" and a variance-scaling dict.
# Presumably (units, activation, initializer) — confirm against models.ActorModel.
dense_layers = [
    (
        64,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for _ in range(3)
]

actor = models.ActorModel(env, cnn_model=cnn, dense_layers=dense_layers, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.0001, normalize=False)

In [None]:
actor

In [None]:
# build critic

# No entries for the critic's `state_layers` argument in this configuration.
state_layers = []

# Three identical 64-entry specs for the critic's `merged_layers` argument.
merged_layers = [
    (
        64,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for _ in range(3)
]


critic = models.CriticModel(env=env, cnn_model=cnn, state_layers=state_layers, merged_layers=merged_layers, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.0001, normalize=False)

In [None]:
critic

In [None]:
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape=(1,))
# Fall back to CPU when no GPU is present; a hard-coded 'cuda' breaks this cell
# on CPU-only machines.
noise = helper.OUNoise(shape=env.action_space.shape, mean=0.0, theta=0.15, sigma=0.01, dt=1.0,
                       device='cuda' if T.cuda.is_available() else 'cpu')

In [None]:
ddpg_agent = rl_agents.DDPG(
    env,
    actor,
    critic,
    discount=0.98,
    tau=0.05,
    action_epsilon=0.2,
    replay_buffer=replay_buffer,
    batch_size=128,
    noise=noise,
    callbacks=[rl_callbacks.WandbCallback("CarRacing-v2")]
)

In [None]:
ddpg_agent.train(1000, True, 10)

In [None]:
wandb.finish()

In [None]:
wandb.login()

# HER

In [None]:
env = gym.make("Reacher-v4")

In [None]:
_,_ = env.reset()

In [None]:
achieved_goal = gym_helper.reacher_achieved_goal(env)
action = env.action_space.sample()
env.step(action)
print(f'observation: {env.get_wrapper_attr("_get_obs")()}')
print(f'distance to goal: {env.get_wrapper_attr("_get_obs")()[8::]}')
print(f'fingertip: {env.get_wrapper_attr("get_body_com")("fingertip")}')
print(f'target: {env.get_wrapper_attr("get_body_com")("target")}')

In [None]:
next_achieved_goal = env.get_wrapper_attr("_get_obs")()[8::]
desired_goal = [0.0, 0.0, 0.0]

In [None]:
reward_func(env, action, achieved_goal, next_achieved_goal, desired_goal, 0.05)

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
desired_goal_func(env).shape

In [None]:
# build actor

# Three identical 256-entry specs, each with "relu" and a variance-scaling dict.
# Presumably (units, activation, initializer) — confirm against models.ActorModel.
dense_layers = [
    (
        256,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for _ in range(3)
]

actor = models.ActorModel(env,
                          cnn_model=None,
                          dense_layers=dense_layers,
                          goal_shape=(3,),
                          optimizer="Adam",
                          optimizer_params={'weight_decay':0.0},
                          learning_rate=0.0001, normalize=False)

In [None]:
# build critic

# No entries for the critic's `state_layers` argument in this configuration.
state_layers = []

# Three identical 256-entry specs for the critic's `merged_layers` argument.
merged_layers = [
    (
        256,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for _ in range(3)
]


critic = models.CriticModel(env=env,
                            cnn_model=None,
                            state_layers=state_layers,
                            merged_layers=merged_layers,
                            goal_shape=(3,),
                            optimizer="Adam",
                            optimizer_params={'weight_decay':0.0},
                            learning_rate=0.0001,
                            normalize=False)

In [None]:
goal_shape = desired_goal_func(env).shape
replay_buffer = helper.ReplayBuffer(env, 100000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape,
#                        mean=0.0,
#                        theta=0.05,
#                        sigma=0.15,
#                        dt=1.0, device='cuda')

noise=helper.NormalNoise(shape=env.action_space.shape,
                         mean = 0.0,
                         stddev=0.05,
                         )

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=256,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('Reacher-v4')])

In [None]:
her = rl_agents.HER(ddpg_agent,
                    strategy='future',
                    num_goals=4,
                    tolerance=0.001,
                    desired_goal=desired_goal_func,
                    achieved_goal=achieved_goal_func,
                    reward_fn=reward_func)

In [None]:
her.train(10, 50, 16, 40, True, 1000)

In [None]:
wandb.finish()

In [None]:
her.test(10, True, 1)

In [None]:
her.save()

In [None]:
her.agent.goal_normalizer.running_std

In [None]:
loaded_her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/assets/models/her")

In [None]:
loaded_her.agent.replay_buffer.sample(10)

In [None]:
loaded_her.agent.state_normalizer.running_cnt

In [None]:
loaded_her.get_config()

In [None]:
loaded_her.test(10, True, 1)

In [None]:
10e4

# HER w/CNN

In [None]:
env = gym.make('CarRacing-v2')

In [None]:
_,_ = env.reset()

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# The callable returned by gym_helper.get_her_goal_functions is bound to
# `desired_goal_func`; `desired_goal` is either unbound or a plain list from an
# earlier cell, so calling it fails.
desired_goal_func(env).shape

In [None]:
# Layer specs passed to cnn_models.CNN: three unbiased conv layers (kernels 7, 5, 3;
# stride 3; 'valid' padding; 32 output channels), with relu + batchnorm after the
# first two.
cnn_layers = [
    {"conv": {"out_channels": 32, "kernel_size": 7, "stride": 3, "padding": 'valid', "bias": False}},
    {"relu": {}},
    {"batchnorm": {"num_features": 32}},
    {"conv": {"out_channels": 32, "kernel_size": 5, "stride": 3, "padding": 'valid', "bias": False}},
    {"relu": {}},
    {"batchnorm": {"num_features": 32}},
    {"conv": {"out_channels": 32, "kernel_size": 3, "stride": 3, "padding": 'valid', "bias": False}},
]

cnn = cnn_models.CNN(cnn_layers, env)

In [None]:
# build actor

# Three identical 256-entry specs, each with "relu" and a variance-scaling dict.
# Presumably (units, activation, initializer) — confirm against models.ActorModel.
dense_layers = [
    (
        256,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for _ in range(3)
]

actor = models.ActorModel(env,
                          cnn_model=cnn,
                          dense_layers=dense_layers,
                          goal_shape=(1,),
                          optimizer="Adam",
                          optimizer_params={'weight_decay':0.0},
                          learning_rate=0.001, normalize=False)

In [None]:
actor

In [None]:
# build critic

# No entries for the critic's `state_layers` argument in this configuration.
state_layers = []

# Three identical 256-entry specs for the critic's `merged_layers` argument.
merged_layers = [
    (
        256,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for _ in range(3)
]


critic = models.CriticModel(env=env,
                            cnn_model=cnn,
                            state_layers=state_layers,
                            merged_layers=merged_layers,
                            goal_shape=(1,),
                            optimizer="Adam",
                            optimizer_params={'weight_decay':0.0},
                            learning_rate=0.001,
                            normalize=False)

In [None]:
critic

In [None]:
goal_shape = desired_goal_func(env).shape
replay_buffer = helper.ReplayBuffer(env, 100000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape,
#                        mean=0.0,
#                        theta=0.05,
#                        sigma=0.15,
#                        dt=1.0, device='cuda')

noise=helper.NormalNoise(shape=env.action_space.shape,
                         mean = 0.0,
                         stddev=0.05,
                         )

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=256,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('CarRacing-v2')])

In [None]:
ddpg_agent.actor_model

In [None]:
her = rl_agents.HER(ddpg_agent,
                    strategy='future',
                    num_goals=4,
                    tolerance=1,
                    desired_goal=desired_goal_func,
                    achieved_goal=achieved_goal_func,
                    reward_fn=reward_func)

In [None]:
her.agent.actor_model

In [None]:
her.train(num_epochs=20,
          num_cycles=50,
          num_episodes=16,
          num_updates=40,
          render=True,
          render_freq=20
        )

In [None]:
her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/models/her")

In [None]:
wandb.finish()

In [None]:
# Manually play one episode with the HER agent, writing each transition to the
# replay buffer and to local lists that the relabelling cells below consume.
# NOTE(review): other cells reach the normalizers via `her.agent.state_normalizer` /
# `her.agent.goal_normalizer`; confirm `her.state_normalizer` / `her.goal_normalizer`
# exist on the HER object.

# reset environment
state, _ = her.agent.env.reset()
# instantiate empty lists to store current episode trajectory
states, actions, next_states, dones, state_achieved_goals, \
next_state_achieved_goals, desired_goals = [], [], [], [], [], [], []
# set desired goal
desired_goal = her.desired_goal_func(her.agent.env)
# set achieved goal
state_achieved_goal = her.achieved_goal_func(her.agent.env)
# add initial state and goals to local normalizer stats
her.state_normalizer.update_local_stats(state)
her.goal_normalizer.update_local_stats(desired_goal)
her.goal_normalizer.update_local_stats(state_achieved_goal)
# set done flag
done = False
# reset episode reward to 0
episode_reward = 0
# reset steps counter for the episode
episode_steps = 0

while not done:
    # get normalized values for state and desired goal
    state_norm = her.state_normalizer.normalize(state)
    desired_goal_norm = her.goal_normalizer.normalize(desired_goal)
    # get action
    action = her.agent.get_action(state_norm, desired_goal_norm, grad=False)
    # take action
    next_state, reward, term, trunc, _ = her.agent.env.step(action)
    # update the done flag BEFORE storing the transition; previously it was set
    # after the buffer/list writes, so every stored `done` (including the final
    # step's) was False
    done = term or trunc
    # get next state achieved goal
    next_state_achieved_goal = her.achieved_goal_func(her.agent.env)
    # add next state and next state achieved goal to normalizers
    her.state_normalizer.update_local_stats(next_state)
    her.goal_normalizer.update_local_stats(next_state_achieved_goal)
    # store trajectory in replay buffer (non normalized!)
    her.agent.replay_buffer.add(state, action, reward, next_state, done,\
                                    state_achieved_goal, next_state_achieved_goal, desired_goal)
    
    # append step state, action, next state, and goals to respective lists
    states.append(state)
    actions.append(action)
    next_states.append(next_state)
    dones.append(done)
    state_achieved_goals.append(state_achieved_goal)
    next_state_achieved_goals.append(next_state_achieved_goal)
    desired_goals.append(desired_goal)

    # add to episode reward and increment steps counter
    episode_reward += reward
    episode_steps += 1
    # update state and state achieved goal
    state = next_state
    state_achieved_goal = next_state_achieved_goal

In [None]:
# package episode states, actions, next states, and goals into trajectory tuple
trajectory = (states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)

In [None]:
states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals = trajectory

In [None]:
for idx, (s, a, ns, d, sag, nsag, dg) in enumerate(zip(states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)):
    print(f'a={a}, d={d}, sag={sag}, nsag={nsag}, dg={dg}')

In [None]:
# Hindsight relabelling of the trajectory collected above.
#   "final":  every step is stored once more with the last achieved goal of the
#             episode as its desired goal.
#   "future": every step is stored up to `num_goals` more times, each with a
#             desired goal drawn uniformly from the achieved goals of later steps.
strategy = "future"
num_goals = 4

# loop over each step in the trajectory to set new achieved goals, calculate new reward, and save to replay buffer
for idx, (state, action, next_state, done, state_achieved_goal, next_state_achieved_goal, desired_goal) in enumerate(zip(states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)):

    if strategy == "final":
        # the substitute goal is what the episode actually achieved at its last step
        new_desired_goal = next_state_achieved_goals[-1]
        new_reward = her.reward_fn(state_achieved_goal, next_state_achieved_goal, new_desired_goal)
        print(f'transition: action={action}, reward={new_reward}, done={done}, state_achieved_goal={state_achieved_goal}, next_state_achieved_goal={next_state_achieved_goal}, desired_goal={new_desired_goal}')
        her.agent.replay_buffer.add(state, action, new_reward, next_state, done, state_achieved_goal, next_state_achieved_goal, new_desired_goal)

    if strategy == 'future':
        for i in range(num_goals):
            # stop once i + 1 reaches the number of steps left after idx, so steps
            # near the end get fewer relabelled copies (the last step gets none)
            if idx + i + 1 >= len(states):
                break
            # np.random.randint's upper bound is exclusive: picks a step in [idx + 1, len(states) - 1]
            goal_idx = np.random.randint(idx + 1, len(states))
            new_desired_goal = next_state_achieved_goals[goal_idx]
            new_reward = her.reward_fn(state_achieved_goal, next_state_achieved_goal, new_desired_goal)
            print(f'transition: action={action}, reward={new_reward}, done={done}, state_achieved_goal={state_achieved_goal}, next_state_achieved_goal={next_state_achieved_goal}, desired_goal={new_desired_goal}')
            her.agent.replay_buffer.add(state, action, new_reward, next_state, done, state_achieved_goal, next_state_achieved_goal, new_desired_goal)
    

    


In [None]:
s, a, r, ns, d, sag, nsag, dg = her.agent.replay_buffer.sample(100)

In [None]:
for i in range(100):
    print(f'{i}: a={a[i]}, r={r[i]}, d={d[i]}, sag={sag[i]}, nsag={nsag[i]}, dg={dg[i]} ')

# HER Pendulum

In [None]:
env = gym.make('Pendulum-v1')

In [None]:
# build actor

# Two layer specs (400 then 300), each a ("relu", variance-scaling) tuple.
# Presumably the tuple is (units, activation, initializer) — confirm against models.ActorModel.
dense_layers = [
    (
        units,
        "relu",
        {"variance scaling": {"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}},
    )
    for units in (400, 300)
]

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, optimizer='Adam',
                          optimizer_params={'weight_decay':0.01}, learning_rate=0.001, normalize=False)

In [None]:
# build critic

# No entries for the critic's `state_layers` argument in this configuration.
state_layers = []

# Three identical 64-entry specs for `merged_layers`, each with "relu" and an
# empty "kaiming uniform" options dict.
merged_layers = [
    (64, "relu", {"kaiming uniform": {}})
    for _ in range(3)
]


critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.001, normalize=False)

In [None]:
replay_buffer = helper.ReplayBuffer(env, 100000, (3,))
# Fall back to CPU when no GPU is present; a hard-coded 'cuda' breaks this cell
# on CPU-only machines.
noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0,
                       device='cuda' if T.cuda.is_available() else 'cpu')

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.99,
                            tau=0.005,
                            replay_buffer=replay_buffer,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('Pendulum-v1')])

In [None]:
def desired_goal_func(env):
    """Return the fixed desired goal, a length-3 float vector of zeros; ``env`` is not used.

    NOTE(review): if this is meant to be the upright Pendulum observation
    (cos, sin, angular velocity), that would be [1, 0, 0] — confirm.
    """
    return np.zeros(3)


def achieved_goal_func(env):
    """Return the value produced by calling the environment's ``_get_obs`` attribute."""
    get_obs = env.get_wrapper_attr('_get_obs')
    return get_obs()


def reward_func(env):
    """Placeholder reward function: performs no computation and returns ``None``."""
    return None

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='none',
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=10.0
)

In [None]:
her.agent.critic_model

In [None]:
her.agent.target_critic_model

In [None]:
her.train(1,1,100,1)

In [None]:
wandb.finish()

In [None]:
state = env.observation_space.sample()
state

In [None]:
her.agent.state_normalizer.normalize(state)

In [None]:
goal = her.desired_goal_func(her.agent.env)
goal

In [None]:
her.agent.goal_normalizer.normalize(goal)

In [None]:
def remove_renders(folder_path):
    """Delete rendered videos and their metadata files from ``folder_path``.

    Every directory entry whose name ends in ``.mp4`` or ``.meta.json`` is
    removed with ``os.remove``; all other entries are left in place. Only the
    top level of the folder is scanned.
    """
    render_suffixes = (".mp4", ".meta.json")
    for entry in os.listdir(folder_path):
        if not entry.endswith(render_suffixes):
            continue
        os.remove(os.path.join(folder_path, entry))

In [None]:
remove_renders("/workspaces/RL_Agents/pytorch/src/app/assets/models/ddpg/renders/training")

# HER Fetch-Reach (Robotics)

In [None]:
env = gym.make("FetchReach-v2", max_episode_steps=50)

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
achieved_goal_func(env)

In [None]:
env.get_wrapper_attr("_get_obs")()

In [None]:
# reset env state
env.reset()

In [None]:
goal_shape = desired_goal_func(env).shape

In [None]:
goal_shape

In [None]:
# Actor network definition.

# Three identical hidden-layer specs: 64 units, "relu", and a
# "kaiming uniform" entry with an empty options dict (one dict per layer).
dense_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

actor = models.ActorModel(
    env,
    cnn_model=None,
    dense_layers=dense_layers,
    goal_shape=goal_shape,
    optimizer='Adam',
    optimizer_params={'weight_decay':0.0},
    learning_rate=1e-5,
    normalize_layers=False,
)

In [None]:
actor

In [None]:
# Critic network definition (goal-conditioned: receives goal_shape).

# No layers are configured for the state-only branch.
state_layers = []

# Three identical hidden-layer specs: 64 units, "relu", and a
# "kaiming uniform" entry with an empty options dict (one dict per layer).
merged_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

critic = models.CriticModel(
    env=env,
    cnn_model=None,
    state_layers=state_layers,
    merged_layers=merged_layers,
    goal_shape=goal_shape,
    optimizer="Adam",
    optimizer_params={'weight_decay':0.0},
    learning_rate=1e-5,
    normalize_layers=False,
)

In [None]:
critic

In [None]:
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=256,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback("FetchReach-v2")])

In [None]:
ddpg_agent.critic_model

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='future',
    tolerance=0.05,
    num_goals=4,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=5.0
)

In [None]:
her.train(num_epochs=50,
          num_cycles=50,
          num_episodes=16,
          num_updates=40,
          render=True,
          render_freq=1000)

In [None]:
states, action, rewards, next_states, dones, achieved_goals, next_achieved_goals, desired_goals = her.agent.replay_buffer.sample(2)

In [None]:
desired_goals

In [None]:
her.agent.env.get_wrapper_attr("distance_threshold")

In [None]:
# get success
her.agent.env.get_wrapper_attr("_is_success")(achieved_goal_func(her.agent.env), desired_goal_func(her.agent.env))

In [None]:
# Call the environment's `goal_distance` attribute on one achieved/desired goal pair.
# NOTE(review): `next_state_achieved_goal` and `desired_goal` are not assigned in any
# nearby cell (the sampled batch is unpacked into `next_achieved_goals` /
# `desired_goals`) — confirm these names exist in the kernel before running.
her.agent.env.get_wrapper_attr("goal_distance")(next_state_achieved_goal, desired_goal, None)

In [None]:
pusher_her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/assets/models/her")

In [None]:
pusher_her.agent.env.reset()

In [None]:
pusher_her.get_config()

In [None]:
wandb.finish()

In [None]:
np.linalg.norm(pusher_her.agent.env.get_wrapper_attr("get_body_com")("goal") - pusher_her.agent.env.get_wrapper_attr("get_body_com")("object"))

In [None]:
pusher_her.agent.replay_buffer.get_config()

In [None]:

pusher_her.agent.replay_buffer.desired_goals

In [None]:
## TEST ENV
env = gym.make("Pusher-v5", render_mode="rgb_array")

In [None]:
env = gym.wrappers.RecordVideo(
                    env,
                    "/renders/training",
                    episode_trigger=lambda x: True,
                )


In [None]:
# Smoke-test the environment: take 1000 random actions, starting a new
# episode whenever the current one ends, then release the environment.
state, _ = env.reset()

for i in range(1000):
    # take a random action
    next_state, reward, term, trunc, _ = env.step(env.action_space.sample())
    if term or trunc:
        # Gymnasium requires reset() once an episode is terminated or
        # truncated; stepping a finished episode is undefined behaviour.
        state, _ = env.reset()
env.close()

# HER Fetch Push (Robotics)

In [None]:
env = gym.make('FetchPush-v2')

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# reset env state
env.reset()

In [None]:
goal_shape = desired_goal_func(env).shape

In [None]:
# Actor network definition.

# Three identical hidden-layer specs: 64 units, "relu", and a
# "kaiming uniform" entry with an empty options dict (one dict per layer).
dense_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

actor = models.ActorModel(
    env,
    cnn_model=None,
    dense_layers=dense_layers,
    goal_shape=goal_shape,
    optimizer='Adam',
    optimizer_params={'weight_decay':0.0},
    learning_rate=1e-5,
    normalize_layers=False,
)

In [None]:
# Critic network definition (goal-conditioned: receives goal_shape).

# No layers are configured for the state-only branch.
state_layers = []

# Three identical hidden-layer specs: 64 units, "relu", and a
# "kaiming uniform" entry with an empty options dict (one dict per layer).
merged_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

critic = models.CriticModel(
    env=env,
    cnn_model=None,
    state_layers=state_layers,
    merged_layers=merged_layers,
    goal_shape=goal_shape,
    optimizer="Adam",
    optimizer_params={'weight_decay':0.0},
    learning_rate=1e-5,
    normalize_layers=False,
)

In [None]:
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.3,
                            replay_buffer=replay_buffer,
                            batch_size=128,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback("FetchPush-v2")],
                            save_dir="fetch_push/models/ddpg/"
                            )

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='final',
    tolerance=0.05,
    num_goals=4,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=5.0,
    save_dir="fetch_push/models/her/"
)

In [None]:
her.train(num_epochs=50,
          num_cycles=50,
          num_episodes=16,
          num_updates=40,
          render=True,
          render_freq=1000)

# TESTING MULTITHREADING

In [None]:
env = gym.make('FetchPush-v2')

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# reset env state
env.reset()

In [None]:
goal_shape = desired_goal_func(env).shape

In [None]:
# Actor network definition.

# Three identical hidden-layer specs: 64 units, "relu", and a
# "kaiming uniform" entry with an empty options dict (one dict per layer).
dense_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

actor = models.ActorModel(
    env,
    cnn_model=None,
    dense_layers=dense_layers,
    goal_shape=goal_shape,
    optimizer='Adam',
    optimizer_params={'weight_decay':0.0},
    learning_rate=1e-5,
    normalize_layers=False,
)

In [None]:
# Critic network definition (goal-conditioned: receives goal_shape).

# No layers are configured for the state-only branch.
state_layers = []

# Three identical hidden-layer specs: 64 units, "relu", and a
# "kaiming uniform" entry with an empty options dict (one dict per layer).
merged_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

critic = models.CriticModel(
    env=env,
    cnn_model=None,
    state_layers=state_layers,
    merged_layers=merged_layers,
    goal_shape=goal_shape,
    optimizer="Adam",
    optimizer_params={'weight_decay':0.0},
    learning_rate=1e-5,
    normalize_layers=False,
)

In [None]:
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.3,
                            replay_buffer=replay_buffer,
                            batch_size=128,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback("FetchPush-v2")],
                            save_dir="fetch_push/models/ddpg/"
                            )

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='final',
    num_workers=4,
    tolerance=0.05,
    num_goals=4,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=5.0,
    save_dir="fetch_push/models/her/"
)

In [None]:
her.train()

# TESTING

In [None]:
# load config
config_path = "/workspaces/RL_Agents/pytorch/src/app/HER_Test/her/config.json"
with open(config_path, 'r') as file:
    config = json.load(file)

In [None]:
config

In [None]:
agent = rl_agents.HER.load(config)

In [None]:
for callback in agent.agent.callbacks:
    print(callback._sweep)

# Co-Occurrence

In [None]:
import subprocess

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'assets/wandb_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    wandb_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(wandb_config)

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'assets/sweep_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    sweep_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(sweep_config)

In [None]:
# Write both configurations into ./sweep so they can be read back from disk.
# NOTE: the variable names and the target file names cross over here:
#   `sweep_config` (loaded from assets/sweep_config.json) -> sweep/train_config.json
#   `wandb_config` (loaded from assets/wandb_config.json) -> sweep/sweep_config.json
# NOTE(review): presumably intentional (later cells reload these files as
# `train_config` / `sweep_config`) — confirm before renaming anything.

# Save the updated configuration to a train config file
os.makedirs('sweep', exist_ok=True)
train_config_path = os.path.join(os.getcwd(), 'sweep/train_config.json')
with open(train_config_path, 'w') as f:
    json.dump(sweep_config, f)

# Save and Set the sweep config path
sweep_config_path = os.path.join(os.getcwd(), 'sweep/sweep_config.json')
with open(sweep_config_path, 'w') as f:
    json.dump(wandb_config, f)

In [None]:
command = ['python', 'sweep.py']

# Set the environment variable
os.environ['WANDB_DISABLE_SERVICE'] = 'true'

subprocess.Popen(command)

In [None]:
# Set the environment variable
os.environ['WANDB_DISABLE_SERVICE'] = 'true'

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'sweep/sweep_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    sweep_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(sweep_config)

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'sweep/train_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    train_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(train_config)

In [None]:
sweep_id = wandb.sweep(sweep=sweep_config, project=sweep_config["project"])
# loop over num wandb agents
num_agents = 1
# for agent in range(num_agents):
wandb.agent(
    sweep_id,
    function=lambda: wandb_support._run_sweep(sweep_config, train_config,),
    count=train_config['num_sweeps'],
    project=sweep_config["project"],
)

In [None]:
sweep_config

# PPO

In [None]:
from pathlib import Path
from typing import List, Tuple
import torch.nn.functional as F
from torch.distributions import Categorical, Beta, Normal, kl_divergence
import time
import cv2

In [None]:
# PARAMS
# env_id = 'Pendulum-v1'
# env_id = 'LunarLanderContinuous-v3'
env_id = 'BipedalWalker-v3'
policy_lr = 3e-4
value_lr = 2e-5
entropy_coeff = 0.1
kl_coeff = 0.1
loss = 'kl'
timesteps = 100_000
num_envs = 10
device = 'cuda'

seed = 42
env = gym.make_vec(env_id, num_envs)
# env = gym.make('BipedalWalker-v3')
# _,_ = env.reset()
# sample = env.action_space.sample()
# if isinstance(sample, np.int64) or isinstance(sample, np.int32):
#     print(f'discrete action space of size {env.action_space.n}')
# elif isinstance(sample, np.ndarray):
#     print(f'continuous action space of size {env.action_space.shape}')

# Seed PyTorch (CPU generator and the current CUDA device) and NumPy's global RNG.
T.manual_seed(seed)
T.cuda.manual_seed(seed)
np.random.seed(seed)
# NOTE(review): this only stores an attribute named `seed` on the
# `gym.utils.seeding.np_random` object; it does not appear to seed any
# Gymnasium environment. Gymnasium seeds an env via `env.reset(seed=...)` —
# confirm whether the training call takes care of that.
gym.utils.seeding.np_random.seed = seed
# Build policy model
dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
policy = StochasticContinuousPolicy(env, num_envs, dense_layers, learning_rate=policy_lr, distribution='Beta', device=device)
dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
value_function = ValueModel(env, dense_layers, learning_rate=value_lr, device=device)
ppo_agent_hybrid1 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
hybrid_train_info_1 = ppo_agent_hybrid1.train(timesteps=timesteps, trajectory_length=2048, batch_size=640, learning_epochs=10, num_envs=num_envs)

# seed = 43
# env = gym.make(env_id)
# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed
# # Build policy model
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# policy = StochasticContinuousPolicy(env, dense_layers, learning_rate=3e-4)
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# value_function = ValueModel(env, dense_layers, learning_rate=3e-4)
# ppo_agent_hybrid2 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
# hybrid_train_info_2 = ppo_agent_hybrid2.train(timesteps=timesteps, trajectory_length=2048, batch_size=64, learning_epochs=10)

# seed = 44
# env = gym.make(env_id)
# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed
# # Build policy model
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# policy = StochasticContinuousPolicy(env, dense_layers, learning_rate=3e-4)
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# value_function = ValueModel(env, dense_layers, learning_rate=3e-4)
# ppo_agent_hybrid3 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
# hybrid_train_info_3 = ppo_agent_hybrid3.train(timesteps=timesteps, trajectory_length=2048, batch_size=64, learning_epochs=10)
# hybrid_test_info = ppo_agent_hybrid.test(1000, 'PPO_hybrid', 100)

In [None]:
# PARAMS
# env_id = 'Pendulum-v1'
# env_id = 'LunarLanderContinuous-v3'
env_id = 'BipedalWalker-v3'
policy_lr = 3e-4
value_lr = 2e-5
entropy_coeff = 0.1
kl_coeff = 0.01
loss = 'kl'
timesteps = 100_000
num_envs = 10
device = 'cuda'

seed = 42
env = gym.make_vec(env_id, num_envs)
# env = gym.make('BipedalWalker-v3')
# _,_ = env.reset()
# sample = env.action_space.sample()
# if isinstance(sample, np.int64) or isinstance(sample, np.int32):
#     print(f'discrete action space of size {env.action_space.n}')
# elif isinstance(sample, np.ndarray):
#     print(f'continuous action space of size {env.action_space.shape}')

# Seed PyTorch (CPU generator and the current CUDA device) and NumPy's global RNG.
T.manual_seed(seed)
T.cuda.manual_seed(seed)
np.random.seed(seed)
# NOTE(review): this only stores an attribute named `seed` on the
# `gym.utils.seeding.np_random` object; it does not appear to seed any
# Gymnasium environment. Gymnasium seeds an env via `env.reset(seed=...)` —
# confirm whether the training call takes care of that.
gym.utils.seeding.np_random.seed = seed
# Build policy model
dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
policy = StochasticContinuousPolicy(env, num_envs, dense_layers, learning_rate=policy_lr, distribution='Beta', device=device)
dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
value_function = ValueModel(env, dense_layers, learning_rate=value_lr, device=device)
ppo_agent_hybrid2 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
hybrid_train_info_2 = ppo_agent_hybrid2.train(timesteps=timesteps, trajectory_length=2048, batch_size=640, learning_epochs=10, num_envs=num_envs)

# seed = 43
# env = gym.make(env_id)
# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed
# # Build policy model
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# policy = StochasticContinuousPolicy(env, dense_layers, learning_rate=3e-4)
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# value_function = ValueModel(env, dense_layers, learning_rate=3e-4)
# ppo_agent_hybrid2 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
# hybrid_train_info_2 = ppo_agent_hybrid2.train(timesteps=timesteps, trajectory_length=2048, batch_size=64, learning_epochs=10)

# seed = 44
# env = gym.make(env_id)
# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed
# # Build policy model
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# policy = StochasticContinuousPolicy(env, dense_layers, learning_rate=3e-4)
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# value_function = ValueModel(env, dense_layers, learning_rate=3e-4)
# ppo_agent_hybrid3 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
# hybrid_train_info_3 = ppo_agent_hybrid3.train(timesteps=timesteps, trajectory_length=2048, batch_size=64, learning_epochs=10)
# hybrid_test_info = ppo_agent_hybrid.test(1000, 'PPO_hybrid', 100)

In [None]:
## PARAMS ##
# env_id = 'Pendulum-v1'
# env_id = 'LunarLanderContinuous-v3'
# env_id = 'BipedalWalker-v3'
env_id = 'Humanoid-v5'
# env_id = "Reacher-v5"
# env_id = "Walker2d-v5"
# env_id = 'ALE/SpaceInvaders-ram-v5'
# env_id = "CarRacing-v2"
# env_id = "BipedalWalkerHardcore-v3"

timesteps = 1_000_000
trajectory_length = 2000
batch_size = 64
learning_epochs = 10
num_envs = 16
policy_lr = 3e-4
value_lr = 2e-5
policy_clip = 0.2
entropy_coeff = 0.001
loss = 'hybrid'
kl_coeff = 0.0
normalize_advantages = True
normalize_values = False
norm_clip = np.inf
grad_clip = 40.0
reward_clip = 1.0
lambda_ = 0.0
distribution = 'beta'
device = 'cuda'

# Render Settings
render_freq = 100

## WANDB ##
project_name = 'Humanoid-v5'
run_name = None
callbacks = [WandbCallback(project_name, run_name)]
# callbacks = []

seed = 42
env = gym.make(env_id)

save_dir = 'Humanoid'
# env = gym.make('BipedalWalker-v3')
# _,_ = env.reset()
# sample = env.action_space.sample()
# if isinstance(sample, np.int64) or isinstance(sample, np.int32):
#     print(f'discrete action space of size {env.action_space.n}')
# elif isinstance(sample, np.ndarray):
#     print(f'continuous action space of size {env.action_space.shape}')

# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed

# Build policy model
# dense_layers = [(64,"tanh",{"default":{}}),(64,"tanh",{"default":{}})]
layer_config = [
    # {'type': 'cnn', 'params': {'out_channels': 32, 'kernel_size': (8, 8), 'stride': 4, 'padding': 0}},
    # {'type': 'cnn', 'params': {'out_channels': 64, 'kernel_size': (4, 4), 'stride': 2, 'padding': 0}},
    # {'type': 'cnn', 'params': {'out_channels': 64, 'kernel_size': (3, 3), 'stride': 1, 'padding': 0}},
    # {'type': 'flatten'},
    {'type': 'dense', 'params': {'units': 128, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'tanh'},
    {'type': 'dense', 'params': {'units': 64, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'tanh'},
]
# Output-layer spec shared by the policy and value models below.
# NOTE(review): the trailing comma makes `output_layer_kernel` a 1-tuple holding
# this dict rather than the dict itself — confirm the model constructors expect that.
output_layer_kernel = {'type': 'dense', 'params': {'kernel': 'default', 'kernel params':{}}},
policy = StochasticContinuousPolicy(env, layer_config, output_layer_kernel, learning_rate=policy_lr, distribution=distribution, device=device)
# dense_layers = [(64,"tanh",{"default":{}}),(64,"tanh",{"default":{}})]
value_function = ValueModel(env, layer_config, output_layer_kernel, learning_rate=value_lr, device=device)
ppo = PPO(env, policy, value_function, distribution=distribution, discount=0.99, gae_coefficient=0.95, policy_clip=policy_clip, entropy_coefficient=entropy_coeff,
          loss=loss, kl_coefficient=kl_coeff, normalize_advantages=normalize_advantages, normalize_values=normalize_values, value_normalizer_clip=norm_clip, policy_grad_clip=grad_clip,
          reward_clip=reward_clip, lambda_=lambda_, callbacks=callbacks, save_dir=save_dir,device=device)
hybrid_train_info_2 = ppo.train(timesteps=timesteps, trajectory_length=trajectory_length, batch_size=batch_size, learning_epochs=learning_epochs, num_envs=num_envs, seed=seed, render_freq=render_freq)
# ppo.test(10,"ppo_test", 1)


In [23]:
config_file_path = '/workspaces/RL_Agents/src/app/pong_v5_3/ppo/config.json'
with open(config_file_path, 'r') as file:
    config = json.load(file)

In [None]:
config['wrappers']

In [25]:
pong = PPO.load(config, False)

In [None]:
pong.get_config()

In [None]:
pong.train(2000000, 128, 32, 3, 12, 42)

In [15]:
scores = np.zeros(4)

In [None]:
scores[1] = 1
scores

In [47]:
import gymnasium.wrappers as base_wrappers

# Registry of the Gymnasium wrappers that can be requested by name.
# Each entry maps a wrapper name to:
#   "cls"            - the wrapper class from gymnasium.wrappers
#   "default_params" - keyword arguments used unless the caller overrides them
WRAPPER_REGISTRY = {
    "AtariPreprocessing": {
        "cls": base_wrappers.AtariPreprocessing,
        "default_params": {
            "frame_skip": 1,
            "grayscale_obs": True,
            "scale_obs": True
        }
    },
    "TimeLimit": {
        "cls": base_wrappers.TimeLimit,
        "default_params": {
            "max_episode_steps": 1000
        }
    },
    "TimeAwareObservation": {
        "cls": base_wrappers.TimeAwareObservation,
        "default_params": {
            "flatten": False,
            "normalize_time": False
        }
    },
    "FrameStackObservation": {
        "cls": base_wrappers.FrameStackObservation,
        "default_params": {
            "stack_size": 4
        }
    },
    "ResizeObservation": {
        "cls": base_wrappers.ResizeObservation,
        "default_params": {
            "shape": 84  # stored as a bare int; wrap_env() expands an int n to (n, n)
        }
    }
}

In [48]:
wrappers = [
    {'type': "AtariPreprocessing", 'params': {'frame_skip':1, 'grayscale_obs':True, 'scale_obs':True}},
    {'type': "FrameStackObservation", 'params': {'stack_size':4}},
]

In [49]:
def wrap_env(vec_env, wrappers):
    """Build a ``SyncVectorEnv`` of wrapped copies of ``vec_env``'s environment.

    Args:
        vec_env: Vector environment used as a template only: ``vec_env.spec.id``
            names the environment to create and ``vec_env.num_envs`` sets how
            many copies are made.
        wrappers: List of ``{'type': <name>, 'params': {...}}`` dicts. Entries
            whose ``type`` is not a key of ``WRAPPER_REGISTRY`` are skipped.
            ``params`` is optional and overrides the registry defaults. The
            dicts passed in are not modified.

    Returns:
        A ``SyncVectorEnv`` with ``vec_env.num_envs`` sub-environments, each
        created with ``render_mode="rgb_array"`` and wrapped in list order.
    """
    def _as_shape_tuple(shape):
        # Ensure shape is a tuple for ResizeObservation: a bare int n becomes (n, n).
        return (shape, shape) if isinstance(shape, int) else shape

    wrapper_list = []
    for wrapper in wrappers:
        if wrapper['type'] in WRAPPER_REGISTRY:
            print(f'wrapper type:{wrapper["type"]}')
            # Use a copy of default_params to avoid modifying the registry
            default_params = WRAPPER_REGISTRY[wrapper['type']]["default_params"].copy()

            if wrapper['type'] == "ResizeObservation":
                default_params['shape'] = _as_shape_tuple(default_params['shape'])

            print(f'default params:{default_params}')
            # Copy the overrides as well, so normalizing 'shape' below never
            # writes back into the caller's wrapper configuration.
            override_params = dict(wrapper.get("params", {}))

            if wrapper['type'] == "ResizeObservation" and 'shape' in override_params:
                override_params['shape'] = _as_shape_tuple(override_params['shape'])

            print(f'override params:{override_params}')
            final_params = {**default_params, **override_params}
            print(f'final params:{final_params}')

            # cls/params are bound as default arguments so every factory keeps
            # the values of its own loop iteration.
            def wrapper_factory(env, cls=WRAPPER_REGISTRY[wrapper['type']]["cls"], params=final_params):
                return cls(env, **params)

            wrapper_list.append(wrapper_factory)

    def apply_wrappers(env):
        # Apply every configured wrapper in order, reporting the observation
        # space after each step.
        for wrap in wrapper_list:
            env = wrap(env)
            print(f'length of obs space:{len(env.observation_space.shape)}')
            print(f'env obs space shape:{env.observation_space.shape}')
        return env

    print(f'wrapper list:{wrapper_list}')
    envs = [lambda: apply_wrappers(gym.make(vec_env.spec.id, render_mode="rgb_array")) for _ in range(vec_env.num_envs)]
    return SyncVectorEnv(envs)

In [None]:
vec_env = gym.make_vec("ALE/Pong-v5", render_mode="rgb_array", num_envs=8)
wrapped_vec = wrap_env(vec_env, wrappers)

In [None]:
wrapped_vec.single_observation_space

In [None]:
for env in wrapped_vec.envs:
    print(env.spec)

In [141]:
def format_wrappers(wrapper_store):
    """Group flat ``wrapper:<type>_param:<name>`` entries by wrapper type.

    Args:
        wrapper_store: Mapping whose keys look like
            ``'wrapper:<type>_param:<name>'`` and whose values are the
            parameter values.

    Returns:
        A list with one ``{'type': <type>, 'params': {<name>: value, ...}}``
        dict per wrapper type, in order of first appearance.

    The intermediate pieces of every key are printed as they are parsed.
    """
    grouped = {}
    for flat_key, param_value in wrapper_store.items():
        # "wrapper:<type>_param:<name>" -> ["wrapper:<type>", "<name>"]
        pieces = flat_key.split('_param:')
        print(f'parts:{pieces}')
        type_name = pieces[0].split('wrapper:')[1]
        print(f'wrapper_type:{type_name}')
        name = pieces[1]
        print(f'param name:{name}')

        # Create the entry for this wrapper type on first sight, then add the parameter.
        entry = grouped.setdefault(type_name, {'type': type_name, 'params': {}})
        entry['params'][name] = param_value

    return [*grouped.values()]

In [142]:
wrapper_params = {'wrapper:AtariPreprocessing_param:frame_skip': 1, 'wrapper:AtariPreprocessing_param:grayscale_obs': True, 'wrapper:AtariPreprocessing_param:scale_obs': True, 'wrapper:FrameStackObservation_param:stack_size': 4}

In [None]:
formatted_wrappers = format_wrappers(wrapper_params)

In [None]:
formatted_wrappers

In [None]:
# Same flat wrapper settings as above, this time run through the dash_utils helpers.
wrapper_params = {'wrapper:AtariPreprocessing_param:frame_skip': 1, 'wrapper:AtariPreprocessing_param:grayscale_obs': True, 'wrapper:AtariPreprocessing_param:scale_obs': True, 'wrapper:FrameStackObservation_param:stack_size': 4}
formatted_wrappers = dash_utils.format_wrappers(wrapper_params)
# DEBUG: show the formatted wrapper list before it is handed to the env builder.
print(f'formatted wrappers:{formatted_wrappers}')
# NOTE(review): presumably builds the gymnasium Pong env with these wrappers
# applied — confirm against dash_utils.instantiate_envwrapper_obj.
env = dash_utils.instantiate_envwrapper_obj("gymnasium", "ALE/Pong-v5", formatted_wrappers)

In [2]:
# Read a saved agent configuration from JSON and pass it to PPO.load to obtain `ppo`.
# NOTE(review): the path is absolute and tied to this workspace layout.
# NOTE(review): the meaning of the second argument (False) to PPO.load is not
# visible here — confirm against rl_agents.PPO.load.
config_file_path = '/workspaces/RL_Agents/src/app/lunarlander_1/ppo/config.json'
with open(config_file_path, 'r') as file:
    config = json.load(file)
ppo = PPO.load(config, False)

In [None]:
# Display the configuration reported by the loaded agent.
ppo.get_config()

In [None]:
# Reset the environment held by `pong` and display the shape of the returned states.
# NOTE(review): `pong` is not defined in the nearby cells; it must already
# exist in the kernel for this cell to run.
states, _ = pong.env.reset()
states.shape

In [16]:
# Step the environment once with an action sampled from its action space.
# The five returned values are unpacked; by the names, presumably next state,
# reward, terminated and truncated flags, with the fifth value discarded.
ns, r, term, trunc, _ = pong.env.step(pong.env.action_space.sample())

In [None]:
# Shape of `r`, the second value returned by the step call above.
r.shape

In [None]:
# Shape of the per-environment observation space.
pong.env.single_observation_space.shape

In [None]:
# Shape of the environment's overall observation space, for comparison with
# the single-environment shape.
pong.env.observation_space.shape

In [None]:
# Display the spec of the first sub-environment inside the object wrapped by pong.env.
pong.env.env.envs[0].spec

In [None]:
# Sanity-check the policy's output shape on freshly reset states:
# reset, convert the states to a torch tensor, run the policy model (its first
# return value exposes .sample()), draw a sample and display its shape.
states, _ = pong.env.reset()
states = T.tensor(states)
dist, _ = pong.policy_model(states)
sample = dist.sample()
sample.shape

In [None]:
# Display the policy model object.
pong.policy_model

In [3]:
# Train the loaded PPO agent; the captured log below is this call's output.
# NOTE(review): the positional arguments are opaque here. Judging by the log
# (it ends at "total steps: 100000" and reports 8 values per line), 100000
# looks like the step budget and 8 the number of parallel environments; the
# roles of 2048, 64, 20 and 42 are not visible — confirm against PPO.train
# and consider passing them by keyword.
ppo.train(100000, 2048, 64, 20, 8, 42, render_freq=20)

[34m[1mwandb[0m: Currently logged in as: [33mjasonhayes1987[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


Rendering episode 0.0 during training...
rendering episode...
Moviepy - Building video /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_0.0.mp4.
Moviepy - Writing video /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_0.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_0.0.mp4
episode rendered
Episode 1/1 - Score: [-264.04672853]
episode: [ 9. 10. 10.  9. 12.  9. 10. 10.]; total steps: 1000; episodes scores: [-125.718143   -125.64902786 -244.5481212  -156.60258407  -71.261457
 -250.50611558 -452.14999094 -267.90229707]; avg score: -207.52220687749463
episode: [13. 21. 19. 19. 22. 20. 20. 21.]; total steps: 2000; episodes scores: [-331.93419116  -73.57136009 -309.83673772 -179.95051972 -107.06462056
 -354.41640481  -45.74170813 -163.15336158]; avg score: -207.11608290835963
Rendering episode 20.0 during training...
rendering episode...
Moviepy - Building video /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_20.0.mp4.
Moviepy - Writing video /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_20.0.mp4



                                                              

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_20.0.mp4
episode rendered
Episode 1/1 - Score: [-175.01025664]
episode: [20. 31. 29. 28. 30. 30. 30. 30.]; total steps: 3000; episodes scores: [ -83.45319756 -115.88375522 -195.1419367  -118.81413148  -90.67989228
 -103.98883127  -95.4198263  -208.34883311]; avg score: -138.419621317167
episode: [29. 42. 40. 37. 40. 40. 39. 40.]; total steps: 4000; episodes scores: [-309.43040551  -64.62900845 -119.70998001  -57.06469459  -72.94410033
 -106.77781722 -113.02304465 -112.65605935]; avg score: -116.92899652808522
episode: [39. 51. 48. 48. 50. 49. 48. 49.]; total steps: 5000; episodes scores: [-64.96892685 -53.00400644 -44.57787577 -64.6247051  -78.29947243
 -42.89206941 -30.70656524 -67.1829243 ]; avg score: -93.56086600257328
Rendering episode 40.0 during training...
rendering episode...
Moviepy - Building video /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_

                                                               

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_40.0.mp4
episode rendered
Episode 1/1 - Score: [-114.41297677]
episode: [48. 60. 57. 57. 59. 58. 57. 57.]; total steps: 6000; episodes scores: [ -83.49327389  -65.921227    -55.27770073  -39.67783529  -59.14519598
  -52.55314851  -87.29976626 -109.17989813]; avg score: -73.74954349398914
episode: [56. 67. 65. 65. 66. 65. 65. 65.]; total steps: 7000; episodes scores: [-40.51027327 -91.6228463  -54.89690749 -65.44257541 -54.80766018
 -66.2925912  -63.89220017 -55.03817218]; avg score: -68.07501921281325
Rendering episode 60.0 during training...
rendering episode...
Moviepy - Building video /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_60.0.mp4.
Moviepy - Writing video /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_60.0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_60.0.mp4
episode rendered
Episode 1/1 - Score: [-38.29248514]
episode: [63. 76. 72. 72. 73. 72. 73. 72.]; total steps: 8000; episodes scores: [-60.3493451  -87.18762126 -47.63735811 -69.23857036 -80.91623063
 -36.77801252 -35.6942297  -78.74723649]; avg score: -50.1085108811804
episode: [64. 82. 79. 79. 81. 78. 78. 77.]; total steps: 9000; episodes scores: [-83.50624983  -3.60770525 -84.70233656 -64.05617256 -44.80997868
 -44.29257978  30.18337917 -40.87018239]; avg score: -45.80214091205133
episode: [71. 87. 85. 84. 87. 84. 83. 83.]; total steps: 10000; episodes scores: [ -70.48645097 -108.41401946  -36.60729287 -108.19759579  -18.84650431
    1.89607563  -33.37806921  -74.82854212]; avg score: -40.79651217099364
episode: [76. 89. 87. 88. 90. 87. 86. 87.]; total steps: 11000; episodes scores: [ -43.68914677  -37.51759935   32.60536667 -140.10923934  -75.06177829
  -90.80361    

                                                               

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_80.0.mp4
episode rendered
Episode 1/1 - Score: [1.79253689]
episode: [81. 95. 93. 95. 94. 90. 89. 92.]; total steps: 13000; episodes scores: [-127.24280018  -88.26731864 -241.84499005  -53.9188915   -48.59839702
   95.38162639   17.98685549 -100.00867596]; avg score: -46.45361539155246
episode: [84. 97. 95. 96. 95. 91. 90. 93.]; total steps: 14000; episodes scores: [   8.29291406   38.40575549  -28.21310226   69.80016478  128.41911141
 -122.27241988 -215.3087904    34.80113881]; avg score: -44.0487477313833
episode: [85. 98. 96. 97. 97. 92. 92. 94.]; total steps: 15000; episodes scores: [ -48.45490349   68.28231754   93.80638073   -6.35480583  -74.92299264
 -219.42171066 -265.17217063  -38.73074014]; avg score: -45.0991253255632
episode: [86. 99. 97. 98. 99. 93. 93. 95.]; total steps: 16000; episodes scores: [ -85.59727108   -2.7810681    55.76477373  -33.67241458  -21.68335665


                                                                

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_100.0.mp4
episode rendered
Episode 1/1 - Score: [130.81487923]
episode: [100. 115. 112. 111. 113. 107. 108. 108.]; total steps: 29000; episodes scores: [138.48223131 114.98738425 137.15462665 133.81754347 194.34385241
 126.426363    83.86869733 103.72655284]; avg score: 68.85444704558824
episode: [101. 116. 114. 112. 114. 108. 110. 109.]; total steps: 30000; episodes scores: [-53.09411463 101.76335867 -35.43805911 118.36026405 132.07114618
 100.16774616 -20.00752872 108.6925551 ]; avg score: 70.97800014953826
episode: [102. 117. 115. 113. 115. 109. 111. 110.]; total steps: 31000; episodes scores: [ 95.08022031 113.76467223 126.33853037  91.76311873  98.31100835
 109.97263038 149.21592408  86.75737704]; avg score: 74.81840696611197
episode: [103. 118. 117. 114. 116. 110. 112. 111.]; total steps: 32000; episodes scores: [ 82.51015523 134.17003711  23.45528625 165.87134683  80.6776

                                                               

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_120.0.mp4
episode rendered
Episode 1/1 - Score: [251.67730293]
episode: [120. 139. 136. 131. 134. 128. 131. 128.]; total steps: 43000; episodes scores: [250.68418418 203.83504737  29.96139754 263.23861347 236.32941106
 240.19154724 230.28043824 251.13350349]; avg score: 208.93939446664584
episode: [122. 141. 138. 133. 135. 131. 133. 129.]; total steps: 44000; episodes scores: [194.900169   241.66833313 246.49555829 197.61918046 273.14426755
 208.85456572 221.19008967 280.53394483]; avg score: 217.24628584245065
episode: [124. 143. 140. 134. 137. 133. 135. 132.]; total steps: 45000; episodes scores: [244.39311982 310.09782221 243.91312855   5.59541293 242.49254935
 267.16246202 207.71920176 230.33690126]; avg score: 220.54924003661327
episode: [126. 145. 142. 135. 139. 135. 138. 134.]; total steps: 46000; episodes scores: [223.15829294 228.78275134 207.37076488 198.42128291 205.2

                                                               

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_140.0.mp4
episode rendered
Episode 1/1 - Score: [251.4766774]
episode: [140. 158. 156. 151. 153. 148. 152. 147.]; total steps: 52000; episodes scores: [237.09571899 231.10705613  55.84602671 234.50968907 283.7212514
 239.1540791  248.2120341  262.44181133]; avg score: 235.09266921794983
episode: [142. 161. 158. 152. 155. 150. 154. 149.]; total steps: 53000; episodes scores: [256.46211462  40.54154493 231.41943286 246.67292806 236.92466913
 240.9011051  213.210579   261.16660635]; avg score: 231.8517660298383
episode: [144. 162. 161. 155. 158. 153. 155. 152.]; total steps: 54000; episodes scores: [241.67366118 262.43375902 252.42393396 204.39068252 257.80385883
 232.82138452 212.04609748  -3.62656313]; avg score: 229.03931517530054
episode: [147. 165. 163. 157. 161. 154. 157. 154.]; total steps: 55000; episodes scores: [248.5070654  280.5997667  228.73462545 266.06120236  42.7026

                                                               

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_160.0.mp4
episode rendered
Episode 1/1 - Score: [257.67518651]
episode: [162. 177. 175. 170. 173. 166. 169. 168.]; total steps: 60000; episodes scores: [ 32.35201339 235.11599727 296.08870362 279.34805126 247.21562267
 264.70676083 220.17596049 267.4934611 ]; avg score: 191.95894555887926
episode: [164. 180. 177. 172. 176. 169. 171. 171.]; total steps: 61000; episodes scores: [239.83519621  -3.91481533 216.70173067 257.34207059 216.89871494
 257.14019092 276.05806167 224.40602989]; avg score: 203.3259556239829
episode: [166. 181. 180. 175. 179. 171. 174. 174.]; total steps: 62000; episodes scores: [249.57598676 284.9816665  265.2375774  228.89043959 240.09803475
 232.9404411  251.75909265 266.91861137]; avg score: 210.96942165738346
episode: [169. 184. 182. 176. 182. 174. 177. 176.]; total steps: 63000; episodes scores: [232.18614745  72.42966742 258.24550918 270.94214856 256.02

                                                               

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_180.0.mp4
episode rendered
Episode 1/1 - Score: [246.06816381]
episode: [180. 197. 193. 188. 192. 186. 188. 189.]; total steps: 68000; episodes scores: [230.5552876  229.28231866 231.44070043 233.45920079 229.14170853
 247.20885382 260.87572091 263.33767787]; avg score: 240.9653575703613
episode: [182. 199. 196. 190. 194. 188. 190. 191.]; total steps: 69000; episodes scores: [-13.39619177  25.00550118 279.49125377  41.44282338 236.28505773
 207.01866674 239.0788562  254.32138382]; avg score: 239.42846335813044
episode: [185. 202. 198. 193. 197. 191. 193. 194.]; total steps: 70000; episodes scores: [291.74601729 268.99838308  51.09291816 254.6174871  234.84848659
 255.25931625  60.24845164 278.97621353]; avg score: 236.815364848539
episode: [188. 204. 201. 195. 200. 192. 195. 196.]; total steps: 71000; episodes scores: [258.63833164 269.77947949 218.38662064 260.74067617 251.3332

                                                               

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_200.0.mp4
episode rendered
Episode 1/1 - Score: [243.64182647]
episode: [201. 216. 211. 210. 211. 202. 207. 210.]; total steps: 76000; episodes scores: [219.84079072 264.9419306   26.57008108 258.72938412 249.89474071
 281.22741674  19.77994987 241.09799109]; avg score: 232.91159379274467
episode: [204. 219. 213. 212. 214. 205. 210. 213.]; total steps: 77000; episodes scores: [212.48958996 241.18364477 217.1238264  258.26793177 292.38772536
 295.28585287 280.03349033 256.66366534]; avg score: 242.90649200547384
episode: [207. 221. 215. 215. 216. 208. 212. 216.]; total steps: 78000; episodes scores: [281.72076801 292.63478341 210.58267734 276.06467973 240.86612957
 265.81488408 214.66535036 246.70925237]; avg score: 241.89530747720465
episode: [209. 224. 218. 218. 219. 210. 215. 219.]; total steps: 79000; episodes scores: [260.94830811 230.94443851 241.18795143 225.72570496  24.7

                                                               

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_220.0.mp4
episode rendered
Episode 1/1 - Score: [251.86887034]
episode: [221. 235. 229. 226. 230. 219. 226. 228.]; total steps: 83000; episodes scores: [260.527985    14.54963012 296.943617   268.62240584 271.53758309
 283.82927612 249.68092489 257.88342145]; avg score: 228.89137002866715
episode: [224. 238. 230. 230. 233. 221. 229. 231.]; total steps: 84000; episodes scores: [269.89693712 255.6466537   91.4527464  291.51117893 285.71473921
 299.44841432 257.15391785 218.88153275]; avg score: 235.42061797534848
episode: [226. 239. 233. 233. 235. 223. 231. 234.]; total steps: 85000; episodes scores: [251.81504482 260.55445614 259.54219255 280.47130701 262.30609384
 284.40830996 290.68911487   9.51872682]; avg score: 234.2956731597737
episode: [229. 241. 236. 236. 237. 226. 234. 237.]; total steps: 86000; episodes scores: [262.60196158  28.86042122  -5.82918356 277.47117526 260.18

                                                               

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_240.0.mp4
episode rendered
Episode 1/1 - Score: [249.51476469]
episode: [242. 256. 252. 249. 248. 239. 249. 251.]; total steps: 91000; episodes scores: [ 40.37647325 262.88708994 290.94910183 258.21776925 250.59394894
 206.87462318 282.12665311 281.85693834]; avg score: 236.5243990374791
episode: [246. 259. 255. 252. 250. 242. 251. 254.]; total steps: 92000; episodes scores: [ -7.13283258 246.06345376 255.71207415  18.7926046  259.65879282
 240.19661257 268.58291741 245.58278733]; avg score: 236.07365818875329
episode: [249. 262. 258. 256. 252. 245. 254. 258.]; total steps: 93000; episodes scores: [244.91605797 254.55226418 304.33502212 282.02955851 226.40003083
  -1.0662275  272.68924476 240.60672356]; avg score: 238.64962420456578
episode: [252. 265. 261. 258. 253. 248. 257. 259.]; total steps: 94000; episodes scores: [ 38.02616723  23.67905015 241.41480531 239.39776428 106.70

                                                               

Moviepy - Done !
Moviepy - video ready /workspaces/RL_Agents/src/app/lunarlander_1/ppo/renders/train/episode_260.0.mp4
episode rendered
Episode 1/1 - Score: [255.56772427]
episode: [262. 275. 268. 268. 263. 258. 266. 268.]; total steps: 97000; episodes scores: [237.28895963 241.71734701 261.10758564 225.53202166 257.93498734
 286.1106039  256.56102987 258.72779709]; avg score: 238.86017921233008
episode: [265. 278. 271. 272. 267. 262. 270. 272.]; total steps: 98000; episodes scores: [229.81446723 259.75201592 262.56688778 258.76711787  15.74546369
 283.17310178  63.94419207 287.79880199]; avg score: 236.34127353168694
episode: [268. 282. 274. 275. 270. 265. 273. 275.]; total steps: 99000; episodes scores: [277.23285414 288.4933288  264.79957913 301.17213921 284.78295304
 269.3001416  286.43584702 273.32904304]; avg score: 237.35354094671237
episode: [270. 285. 277. 279. 274. 268. 276. 278.]; total steps: 100000; episodes scores: [285.65841912 283.69976266 239.29778222 291.01350049 288.

0,1
actor_loss,▅▅▆▆█▄▃▃▃▂▆▆▄█▁▅▄▄▄▃█▄▃▅▅▄▃▃▂▇▄▇▅▆▄▁▆▆▅▄
advantages,▂▃▄▄█▁▄▃▂▆▆▅▅▅▃▅▃▃▄▄▅▅▄▄▄▃▃▆▄▄▃▅▄▄▅▅▃▄▄▄
critic_loss,██▆▃▂▂▂▂▁▁▁▁▁▁▂▁▃▁▂▁▁▁▂▂▂▁▁▁▄▂▁▁▃▂▁▃▁▁▁▁
entropy,██▇▇▆▆▆▆▆▆▅▅▄▄▃▄▃▄▄▄▃▄▄▄▃▃▂▃▃▃▂▃▂▃▃▂▂▂▂▁
entropy_coefficient,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
episode,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
episode_length,▁▁▁▁▁▁▂▁▃▂▅████▃█▆▆█▃▆▄▄▅▄▃▄▄▃▄▄▃▃▃▃▃▄▂▂
episode_reward,▃▁▁▁▃▅▄▃▄▄▅▄▅▃▃▃▇█▃▇▇▇▃▃▇▆▇▇█▇█▇▇▇▇█▇▅█▇
kl_coefficient,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
kl_divergence,█▇▇▇▇▄▃▃▃▃▃▄▃▆▂▅▂▂▄▂▁▂▃▃▁▃▁▂▄▃▂▄▂▃▂▂▂▃▆▃

0,1
actor_loss,-0.0916
advantages,-0.0
best,False
critic_loss,0.65004
entropy,0.49171
entropy_coefficient,0.001
episode,48
episode_length,261
episode_reward,262.58576
kl_coefficient,0


In [None]:
# Reset the environment held by `pong`; the return value is displayed as the cell output.
pong.env.reset()