In [None]:
import numpy as np
import numba
import umap
import pynndescent

# Report the installed version of each library imported in this cell.
for label, module in (
    ("NumPy", np),
    ("Numba", numba),
    ("UMAP", umap),
    ("PyNNDescent", pynndescent),
):
    print(f"{label} version:", module.__version__)


In [1]:
import os
import json
import ale_py

import ray
import torch as T
import torch.nn as nn
from torch import optim
import numpy as np
import pandas as pd
# from umap import UMAP
from collections import deque

from torch_utils import get_device, move_to_device, verify_device
from torch import distributions

import gymnasium as gym
import gymnasium_robotics
from gymnasium.vector import VectorEnv, SyncVectorEnv
# import models
from models import ValueModel, StochasticContinuousPolicy, ActorModel, CriticModel, StochasticDiscretePolicy
from rl_agents import PPO, DDPG, Reinforce, ActorCritic, TD3, HER
import rl_callbacks
from rl_callbacks import WandbCallback
# from helper import Normalizer
from buffer import ReplayBuffer, PrioritizedReplayBuffer
from noise import NormalNoise
import gym_helper
import wandb_support
import wandb
import gym_helper
import dash_utils
from env_wrapper import EnvWrapper, GymnasiumWrapper
from schedulers import ScheduleWrapper
from distributed_trainer import DistributedAgents

import matplotlib.pyplot as plt


In [None]:
# `mujoco` is not imported anywhere else in this notebook, so import it here;
# without this the cell fails with NameError on a fresh kernel.
import mujoco

print(f'mujoco version: {mujoco.__version__}')

In [2]:
env = gym.make('FetchReach-v4')
env_spec = env.spec
wrap_env = GymnasiumWrapper(env_spec)

In [None]:
state, _ = env.reset()

In [None]:
# Reach the base environment through `unwrapped` instead of chaining `.env`,
# which only works for one exact number of wrappers added by `gym.make`.
env.unwrapped.initial_qpos

In [None]:
wrap_env.env = wrap_env._initialize_env(num_envs=8)

In [None]:
states, _ = wrap_env.reset()

In [None]:
states

In [None]:
mujoco.MjModel

In [None]:
# The package is imported as `gymnasium_robotics`; the `gym_robo` alias is never defined.
gymnasium_robotics.__version__

In [None]:
def check_cuda():
    """Print whether CUDA is available and, if so, the name and total memory of each GPU."""
    if not T.cuda.is_available():
        print("CUDA is not available.")
        return

    print("CUDA is available.")
    device_count = T.cuda.device_count()
    print(f"Number of GPUs detected: {device_count}")

    bytes_per_gb = 1024 ** 3
    for index in range(device_count):
        name = T.cuda.get_device_name(index)
        # total_memory is reported in bytes; convert to GB for display
        total_gb = T.cuda.get_device_properties(index).total_memory / bytes_per_gb
        print(f"GPU {index}: {name}")
        print(f"Total memory: {total_gb:.2f} GB")

check_cuda()

In [None]:
def get_default_device():
    """Return ``T.device('cuda')`` when CUDA is available, otherwise ``T.device('cpu')``."""
    backend = 'cuda' if T.cuda.is_available() else 'cpu'
    return T.device(backend)

device = get_default_device()
print(f"Using device: {device}")

# TEST

In [3]:
# The package is imported as `gymnasium_robotics`; the `gym_robo` alias is never defined.
gymnasium_robotics.register_robotics_envs()

In [2]:
gym.register_envs(gymnasium_robotics)

In [None]:
gym.envs.registration.registry

In [None]:
# Read the API key from the environment instead of hardcoding it in the notebook.
# NOTE: the key that was previously committed on this line should be treated as
# leaked and revoked in the W&B account settings.
# When WANDB_API_KEY is unset, `key=None` is the same as calling `wandb.login()`
# with no key argument.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
# Roll out a random policy and record a video of every episode into `test/`.
env = gym.make('FetchPush-v2', max_episode_steps=100, render_mode='rgb_array')
# The trigger returns True for every episode index, i.e. record all episodes.
env = gym.wrappers.RecordVideo(env, 'test/', episode_trigger=lambda episode_id: True)

episodes = 10

try:
    for episode in range(episodes):
        obs, _ = env.reset()
        done = False
        while not done:
            # The last element is named `info` rather than `dict` so the builtin
            # `dict` is not shadowed for the rest of the notebook.
            obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
            done = terminated or truncated
finally:
    # Close the env even if a step fails.
    env.close()

In [None]:
env = gym.make("FetchReach-v2")
env.reset()
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())

# `gym.make` returns the environment wrapped; call the goal-env helpers on the
# base environment via `unwrapped` so this does not depend on wrappers
# forwarding attribute access.
# The following always has to hold:
assert reward == env.unwrapped.compute_reward(obs["achieved_goal"], obs["desired_goal"], info)
assert truncated == env.unwrapped.compute_truncated(obs["achieved_goal"], obs["desired_goal"], info)
assert terminated == env.unwrapped.compute_terminated(obs["achieved_goal"], obs["desired_goal"], info)

In [None]:
env.compute_reward()

In [None]:
env = gym.make('FetchPush-v2', render_mode='rgb_array')

In [None]:
if hasattr(env, "distance_threshold"):
    print('true')
else:
    print('false')

In [None]:
if env.get_wrapper_attr("distance_threshold"):
    print('true')

In [None]:
print(dir(env))


# DDPG

In [2]:
# num_envs = 4
N = 3
# env = gym.make('BipedalWalker-v3')
# env = gym.make('Pendulum-v1')
env = gym.make('InvertedPendulum-v5')
wrappers = [
    {
        "type": "NStepReward",
        "params": {
            "n": N
        }
    }
]

env_spec = env.spec
env_wrap = GymnasiumWrapper(env_spec, wrappers)
# env_wrap.env = env_wrap._initialize_env(num_envs=num_envs)

In [3]:
# build actor
# Fall back to CPU when no GPU is present so the notebook also runs on CPU-only machines.
device = 'cuda' if T.cuda.is_available() else 'cpu'
actor_optimizer = {'type': 'Adam','params': { 'lr': 0.001 }}

# Hidden stack: input batchnorm, then two dense layers (400 and 300 units),
# each followed by batchnorm and ReLU.
layer_config = [
    {'type': 'batchnorm1d'},
    {'type': 'dense', 'params': {'units': 400, 'kernel': 'variance_scaling', 'kernel params':{"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}}},
    {'type': 'batchnorm1d'},
    {'type': 'relu'},
    {'type': 'dense', 'params': {'units': 300, 'kernel': 'variance_scaling', 'kernel params':{"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}}},
    {'type': 'batchnorm1d'},
    {'type': 'relu'},
]
# output_layer_config = [{'type': 'dense', 'params': {'kernel': 'default', 'kernel params':{}}}]
# Output layer weights are initialised uniformly in [-3e-3, 3e-3].
output_layer_config = [{'type': 'dense', 'params': {'kernel': 'uniform', 'kernel params':{'a':-3e-3, 'b':3e-3}}}]

actor = ActorModel(env_wrap, layer_config, output_layer_config, optimizer_params=actor_optimizer, device=device)

In [None]:
actor

In [None]:
actor.get_config()

In [6]:
# build critic
# critic_optimizer = {'type': 'Adam','params': { 'lr': 0.001, 'weight_decay':0.01}}
critic_optimizer = {'type': 'Adam','params': { 'lr': 0.001}}

state_layer_config = [
    {'type': 'batchnorm1d'},
    {'type': 'dense', 'params': {'units': 400, 'kernel': 'variance_scaling', 'kernel params':{"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}}},
    {'type': 'batchnorm1d'},
    {'type': 'relu'}
]

merged_layer_config = [
    {'type': 'dense', 'params': {'units': 300, 'kernel': 'variance_scaling', 'kernel params':{"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}}},
    {'type': 'relu'},
]
# output_layer_config = {'type': 'dense', 'params': {'kernel': 'default', 'kernel params':{}}},

critic = CriticModel(env_wrap, state_layers=state_layer_config, merged_layers=merged_layer_config,
                    output_layer_kernel=output_layer_config, optimizer_params=critic_optimizer, device=device)

In [7]:
replay_buffer = ReplayBuffer(env_wrap, 1000000, N=N, device='cpu')
# replay_buffer = PrioritizedReplayBuffer(env_wrap, 100000, alpha=0.6, beta_start=0.4, beta_iter=10000, beta_update_freq=1, priority='rank',normalize=False, epsilon=0.01, N=N, device='cpu')
noise = NormalNoise(shape=env_wrap.action_space.shape, stddev=0.1, device=device)

In [None]:
replay_buffer.get_config()

In [9]:
ddpg_agent = DDPG(env=env_wrap,
                actor_model=actor,
                critic_model=critic,
                replay_buffer=replay_buffer,
                discount=0.99,
                tau=0.005,
                action_epsilon=0.2,
                batch_size=128,
                noise=noise,
                grad_clip=40.0,
                warmup=1000,
                N=N,
                callbacks=[rl_callbacks.WandbCallback('InvertedPendulum-v5')],
                save_dir='InvertedPendulum_N3',
                device=device,
                log_level='info')

In [10]:
ddpg_agent.save()

In [None]:
ddpg_agent.train(10, 2, 42, 100)

In [11]:
config = ddpg_agent.get_config()

# Set train config and path
train_config = {
    'num_episodes': 2000,
    'num_envs': 4,
    'seed': 42,
    'render_freq': 500,
}
# os.path.join gives the same path as before when save_dir already ends with a
# separator, and inserts the missing separator when it does not.
train_config_path = os.path.join(config["save_dir"], 'train_config.json')
with open(train_config_path, 'w') as f:
    json.dump(train_config, f)

In [None]:
T.unique(ddpg_agent.replay_buffer.states).size()

In [None]:
ddpg_agent.test(10, True, 1)

In [14]:
config_file_path = '/workspaces/RL_Agents/src/app/models/ddpg/config.json'
with open(config_file_path, 'r') as file:
    config = json.load(file)

In [None]:
ddpg = DDPG.load(config)

In [None]:
ddpg.get_config()

In [None]:
ddpg.test(10, 1)

# N Step Walkthrough

In [12]:
from torch_utils import set_seed

In [13]:
num_episodes = 10
num_envs = 2
seed = 42
sync_iter = 1

# set models to train mode
ddpg_agent.actor_model.train()
ddpg_agent.critic_model.train()
# Set target models to eval mode
ddpg_agent.target_actor_model.eval()
ddpg_agent.target_critic_model.eval()

    # set num_envs as attribute
ddpg_agent.num_envs = num_envs

if seed is None:
    seed = np.random.randint(100)

# Set render freq to 0 if None is passed
# if render_freq == None:
#     render_freq = 0

# Set seeds
set_seed(seed)

# Set sync_interval (for distributed learning)
ddpg_agent._sync_iter = sync_iter

In [14]:
try:
    # instantiate new vec environment
    ddpg_agent.env.env = ddpg_agent.env._initialize_env(0, ddpg_agent.num_envs, seed)
except Exception as e:
    # Include the exception in the message so the cause of the failure is not lost.
    ddpg_agent.logger.error(f"Error in DDPG.train self.env: {e}")

# initialize step counter (for logging)
ddpg_agent._step = 0
best_reward = -np.inf
score_history = deque(maxlen=100)
# trajectories = [[] for _ in range(self.num_envs)]
# One running score and one completed-episode counter per environment.
episode_scores = np.zeros(ddpg_agent.num_envs)
ddpg_agent.completed_episodes = np.zeros(ddpg_agent.num_envs)
# Initialize environments
states, _ = ddpg_agent.env.reset()

In [15]:
# One environment step of the training loop, run by hand.
ddpg_agent._step += 1
# Pick actions for the current states and convert them to the env's format
actions = ddpg_agent.env.format_actions(ddpg_agent.get_action(states))
next_states, rewards, dones, infos = ddpg_agent.env.step(actions)
episode_scores += rewards
# dones = np.logical_or(terms, truncs)

# Store the trajectory reported under infos['n-step trajectory'] in the replay buffer
n_step = infos['n-step trajectory']
ddpg_agent.replay_buffer.add(
    *(n_step[field] for field in ('states', 'actions', 'rewards', 'next_states', 'dones'))
)

In [None]:
ddpg_agent.replay_buffer.next_states

## Distributed

In [None]:
distributed_ddpg = DistributedAgents(ddpg_agent.get_config(),
                                     num_workers=4,
                                     learner_device=None,
                                     learn_freq=10,
                                     log_level='debug')

In [None]:
train_config = {
    'num_episodes': 100,
    'num_envs': 4,
    'seed': 42,
    'render_freq': 100,
}

distributed_ddpg.train(sync_interval=30, **train_config)

In [19]:
t1 = [T.tensor([[ 1.9781e+00,  2.9300e-02, -4.5485e-01,  1.0052e+00, -2.4612e+00,
          3.9259e-01,  9.8080e-01,  9.3881e-01,  2.2028e+00, -2.8109e+00,
          3.8870e-01,  2.0195e+00,  3.1939e-02,  8.6454e-01,  3.3472e-01,
          3.3852e-01,  3.5036e-01,  3.7172e-01,  4.0555e-01,  4.5746e-01,
          5.3847e-01,  6.7270e-01,  7.8548e-01, -7.1772e-01],
        [-4.6067e+00, -8.5158e-02,  7.7986e-01, -1.6501e+00,  5.0755e+00,
         -8.5706e-01, -1.2790e+00, -1.0014e+00, -3.6259e+00,  4.8458e+00,
         -1.1071e+00, -2.4277e+00,  4.8323e-01, -1.5494e+00, -5.7167e-01,
         -5.7816e-01, -5.9840e-01, -6.3487e-01, -6.9265e-01, -7.8130e-01,
         -9.1966e-01, -1.1489e+00, -1.3307e+00,  1.1856e+00],
        [-3.3395e+00, -2.3950e-01,  6.9444e-01,  9.4437e-01, -5.1564e-01,
          1.8767e-01, -1.3053e+00, -2.7629e-01,  7.2648e-01, -8.1033e-01,
          4.2657e-01, -9.2496e-01, -9.6485e-01, -1.4170e-02,  1.6201e-01,
          1.6385e-01,  1.6959e-01,  1.7992e-01,  1.9630e-01,  2.2142e-01,
          2.6063e-01,  3.2560e-01,  4.4705e-01, -2.1345e-01],
        [ 6.1286e+00,  2.2062e-01, -1.1039e+00,  7.5685e-01, -3.2970e+00,
          4.4821e-01,  2.0052e+00,  6.4892e-01,  1.9993e+00, -5.1402e+00,
          8.0700e-01,  3.8900e+00,  2.0480e+00,  1.0676e+00,  5.7863e-02,
          5.8521e-02,  6.0569e-02,  6.4261e-02,  7.0109e-02,  7.9082e-02,
          9.3086e-02,  1.1629e-01, -6.0080e-03, -1.3927e+00],
        [ 6.8055e+00,  2.7492e-01, -1.2478e+00,  6.1935e-01, -2.9432e+00,
          2.8830e-01,  1.4758e+00,  1.1202e+00,  2.1173e+00, -4.2106e+00,
          8.7564e-01,  3.6644e+00,  1.9683e+00,  9.4719e-01,  1.5157e-01,
          1.5329e-01,  1.5865e-01,  1.6832e-01,  1.8364e-01,  2.0715e-01,
          2.4383e-01,  3.0461e-01,  2.9630e-01, -7.7133e-01],
        [ 3.0987e+00,  1.0051e-01, -7.0230e-01,  1.0258e+00, -2.0977e+00,
          6.0411e-01,  2.9427e-01,  1.1515e+00,  1.9098e+00, -2.6412e+00,
          1.2812e+00,  2.0923e+00,  4.9255e-02,  1.0491e+00,  3.6872e-01,
          3.7290e-01,  3.8595e-01,  4.0948e-01,  4.4675e-01,  5.0392e-01,
          5.9316e-01,  7.4103e-01,  8.5930e-01, -6.0360e-01],
        [ 2.3348e+00,  4.2652e-02, -5.1585e-01,  1.0399e+00, -2.5608e+00,
          4.3550e-01,  9.1680e-01,  1.1512e+00,  2.3475e+00, -3.2432e+00,
          9.3177e-01,  2.0315e+00, -5.1433e-01,  8.4729e-01,  3.3179e-01,
          3.3556e-01,  3.4730e-01,  3.6847e-01,  4.0201e-01,  4.5346e-01,
          5.3376e-01,  6.6682e-01,  7.7946e-01, -8.7560e-01],
        [ 1.2456e+00,  1.8144e-02, -2.9436e-01,  8.2807e-01, -1.6374e+00,
          4.8683e-01,  2.0424e-01,  1.1818e-01,  1.2919e+00, -1.8045e+00,
          7.7424e-01,  1.5638e+00,  4.5440e-01,  5.9611e-01,  1.7324e-01,
          1.7521e-01,  1.8134e-01,  1.9240e-01,  2.0991e-01,  2.3677e-01,
          2.7870e-01,  3.4818e-01,  3.8399e-01, -6.8423e-01],
        [-2.6813e+00, -2.2750e-01,  6.0622e-01,  8.2969e-01, -2.2157e-01,
          4.0851e-01, -1.5937e+00,  1.3547e-02,  1.3178e+00, -4.7801e-01,
          9.3756e-01,  4.3855e-01,  3.2618e-01,  5.4413e-01,  4.9621e-01,
          5.0185e-01,  5.1941e-01,  5.5107e-01,  6.0122e-01,  6.7817e-01,
          7.9827e-01,  9.9726e-01,  1.3115e+00,  1.1633e+00],
        [ 3.1350e+00,  3.3136e-02, -5.8090e-01,  1.4118e+00, -3.7456e+00,
          6.8638e-01,  1.1218e+00,  1.2487e+00,  2.9678e+00, -4.1772e+00,
          1.2304e+00,  2.3360e+00, -4.0654e-01,  1.1904e+00,  4.3887e-01,
          4.4385e-01,  4.5938e-01,  4.8739e-01,  5.3174e-01,  5.9980e-01,
          7.0602e-01,  8.8202e-01,  1.0475e+00, -9.7193e-01]]), T.tensor([-1.2826,  2.0776, -0.9381, -1.8182, -1.1197, -1.2902, -1.4605, -1.2887,
         0.3230, -1.8059]), T.tensor([[-0.0422, -0.8016,  0.1579,  0.4014,  0.2079, -0.0332,  0.3065, -0.9145,
         -0.2028, -0.1832, -0.0109,  0.0716,  0.4895,  0.3676],
        [-1.0795,  0.8898, -1.2500, -1.1461, -0.7623, -0.2900, -1.7939,  0.2717,
         -0.2666, -0.9858, -0.1224,  0.3499, -0.8556, -0.4665],
        [-0.7600,  0.5804, -0.8709, -0.7988, -0.5327, -0.2008, -1.2529,  0.1773,
         -0.1943, -0.6906, -0.0809,  0.2503, -0.6021, -0.3237],
        [-0.6606,  0.4213, -0.7522, -0.6903, -0.4608, -0.1749, -1.0906,  0.1277,
         -0.1738, -0.6039, -0.0709,  0.2162, -0.5212, -0.2819],
        [-0.8219,  0.8583, -0.9700, -0.8878, -0.5826, -0.2163, -1.3749,  0.3033,
         -0.1777, -0.7601, -0.0847,  0.2710, -0.6463, -0.3597],
        [-0.9559,  1.0077, -1.1208, -1.0268, -0.6748, -0.2471, -1.5995,  0.3593,
         -0.2017, -0.8868, -0.1142,  0.3143, -0.7408, -0.4008],
        [-0.6996,  0.6932, -0.8155, -0.7475, -0.4898, -0.1791, -1.1662,  0.2477,
         -0.1468, -0.6448, -0.0945,  0.2369, -0.5536, -0.2843],
        [-0.6854,  0.8011, -0.8260, -0.7569, -0.4982, -0.1771, -1.1601,  0.2914,
         -0.1399, -0.6367, -0.0832,  0.2278, -0.5407, -0.3010],
        [-0.9841,  1.1210, -1.1683, -1.0702, -0.6988, -0.2516, -1.6502,  0.4082,
         -0.1956, -0.9049, -0.1152,  0.3231, -0.7678, -0.4335],
        [-0.8280,  0.9214, -0.9795, -0.8973, -0.5868, -0.2120, -1.3820,  0.3340,
         -0.1683, -0.7588, -0.0940,  0.2733, -0.6511, -0.3593]]), T.tensor([-0.1637, -0.6195, -0.4485, -0.4163, -0.4074, -0.4680, -0.3543, -0.3162,
        -0.4537, -0.3880]), T.tensor([[-1.9799,  4.7743,  5.6850,  4.1981,  4.7544,  5.8809,  4.8192,  6.2063,
          4.9247,  5.7111]]), T.tensor([1.1162])]

In [None]:
t2 = [T.tensor([[-3.0210, -0.0610,  0.1870,  1.3492,  1.9128,  0.6030,  1.0716,  0.3754,
          1.4766,  1.6674,  0.5560, -0.6107, -2.3209,  0.0915,  0.4787,  0.4842,
          0.5011,  0.5317,  0.5801,  0.6543,  0.7702,  0.9621,  1.1827, -0.1124],
        [ 3.9005,  0.0995, -0.4463, -2.2449, -1.9636, -0.9523, -0.9744, -0.5649,
         -1.8575, -2.0559, -1.2822,  1.7660,  3.9600, -0.2762, -0.8333, -0.8427,
         -0.8722, -0.9254, -1.0096, -1.1388, -1.3405, -1.6747, -2.0584,  0.0768],
        [-6.0434, -0.2660,  0.8613,  1.2666,  0.3657,  0.4043,  0.2389,  0.0420,
         -0.4318, -0.2625, -0.1022, -1.2634, -2.9789,  0.0556,  0.0564,  0.0571,
          0.0591,  0.0627,  0.0684,  0.0771,  0.0908,  0.1134,  0.1518, -0.9142],
        [ 2.4498,  0.1666, -0.4657,  0.7954,  1.4183,  0.2234,  0.8693,  0.1640,
          1.7937,  0.2179,  1.0497,  0.7746,  0.4628, -0.1557,  0.3791,  0.3834,
          0.3968,  0.4210,  0.4593,  0.5181,  0.6099,  0.7619,  0.8833,  0.1880],
        [ 2.8586,  0.2016, -0.5509,  0.7166,  1.6745,  0.1015,  0.4443,  0.4552,
          1.9645,  0.9740,  1.0916,  0.7404,  0.5557, -0.2037,  0.4744,  0.4798,
          0.4966,  0.5269,  0.5748,  0.6484,  0.7632,  0.9535,  1.1732,  0.7246],
        [-2.6782, -0.0171,  0.1804,  1.4666,  1.8017,  0.4859,  0.9921,  0.4301,
          0.3644,  1.2674,  0.6468, -0.2504, -2.3037,  0.2129,  0.3462,  0.3502,
          0.3624,  0.3845,  0.4195,  0.4732,  0.5570,  0.6959,  0.8007, -0.6199],
        [-3.3263, -0.0738,  0.1812,  1.0704,  1.5368,  0.5137,  0.3134,  0.4082,
          1.2941,  1.0506, -0.0810, -0.2996, -2.1341,  0.9454,  0.6118,  0.6188,
          0.6404,  0.6795,  0.7413,  0.8362,  0.9842,  1.2296,  1.5561,  0.3532],
        [-2.0407, -0.0232,  0.1365,  1.0193,  1.0490,  0.3524,  0.4713, -0.1737,
          0.6543,  0.9926,  0.4739, -0.0258, -1.2229,  0.1204,  0.2555,  0.2584,
          0.2674,  0.2837,  0.3096,  0.3492,  0.4110,  0.5135,  0.6139, -0.3420],
        [-3.8335, -0.0793,  0.5133,  1.4524,  1.2858,  0.2380,  0.2464, -0.4693,
          0.2244,  1.1802,  0.6321, -0.6566, -2.1882, -0.3009,  0.0969,  0.0980,
          0.1014,  0.1076,  0.1174,  0.1324,  0.1558,  0.1947,  0.2167, -0.7694],
        [-4.1113, -0.1023,  0.2936,  1.4175,  1.4192,  0.6452,  0.4410,  0.2712,
          1.5895,  1.1937, -0.0969, -0.3495, -2.6013,  1.1924,  0.7412,  0.7497,
          0.7759,  0.8232,  0.8981,  1.0131,  1.1924,  1.4897,  1.8926,  0.3718]]), T.tensor([-1.3106,  1.8997, -1.9664, -0.6065, -0.0346, -1.9460, -0.6598, -1.3116,
        -2.1798, -0.9779]), T.tensor([[-0.1001, -1.0269,  0.6850,  0.2108,  0.3047,  0.1504, -0.7376, -0.7068,
          0.6134, -0.6132, -0.0582,  0.2512, -0.0415,  0.2464],
        [-1.1352,  1.0962, -2.0336, -0.9179, -0.9472, -0.5573, -0.7028, -0.2256,
         -1.3325, -0.6510, -0.2182,  0.2025, -0.3683, -0.1401],
        [-0.7934,  0.7257, -1.4178, -0.6400, -0.6616, -0.3889, -0.4958, -0.1750,
         -0.9335, -0.4587, -0.1483,  0.1495, -0.2620, -0.0945],
        [-0.6914,  0.5368, -1.2250, -0.5532, -0.5722, -0.3374, -0.4346, -0.1816,
         -0.8165, -0.4004, -0.1304,  0.1313, -0.2295, -0.0827],
        [-0.8653,  1.0087, -1.5789, -0.7117, -0.7295, -0.4256, -0.5347, -0.0922,
         -1.0090, -0.5003, -0.1611,  0.1564, -0.2789, -0.1066],
        [-0.9981,  1.2073, -1.8238, -0.8224, -0.8426, -0.4873, -0.6167, -0.0831,
         -1.1556, -0.5773, -0.1941,  0.1833, -0.3136, -0.1120],
        [-0.7336,  0.8192, -1.3286, -0.5992, -0.6143, -0.3579, -0.4565, -0.0887,
         -0.8472, -0.4261, -0.1517,  0.1435, -0.2420, -0.0756],
        [-0.7258,  0.9153, -1.3452, -0.6066, -0.6237, -0.3612, -0.4451, -0.0534,
         -0.8485, -0.4169, -0.1431,  0.1355, -0.2299, -0.0887],
        [-1.0384,  1.2801, -1.9018, -0.8573, -0.8769, -0.5102, -0.6358, -0.0789,
         -1.2001, -0.5936, -0.2034,  0.1918, -0.3289, -0.1280],
        [-0.8687,  1.0686, -1.5950, -0.7189, -0.7362, -0.4261, -0.5291, -0.0663,
         -1.0120, -0.4976, -0.1674,  0.1614, -0.2795, -0.1053]]), T.tensor([-0.2694, -0.6020, -0.4351, -0.4105, -0.4003, -0.4449, -0.3495, -0.3165,
        -0.4542, -0.3832]), T.tensor([[-1.7109,  6.2001,  7.4709,  5.1641,  6.4154,  6.4333,  6.1691,  6.3029,
          6.2611,  7.3640]]), T.tensor([1.0784])]

In [None]:
def compute_average_gradients(gradient_lists):
    """
    Compute average gradients across workers

    Args:
        gradient_lists: List of gradient lists from different workers
                       e.g., [t1, t2] where t1 and t2 are lists of tensors.
                       Every worker must supply the same number of tensors, and
                       tensors at the same position must share a shape because
                       they are combined with ``T.stack``.

    Returns:
        List of averaged gradient tensors, one per parameter position.
        An empty list when ``gradient_lists`` is empty.

    Raises:
        ValueError: If the workers supply different numbers of gradient tensors.
    """
    # Nothing to average: return an empty result instead of failing on gradient_lists[0]
    if not gradient_lists:
        return []

    # Refuse mismatched workers rather than silently dropping trailing gradients
    expected = len(gradient_lists[0])
    for worker, grads in enumerate(gradient_lists):
        if len(grads) != expected:
            raise ValueError(
                f"Worker {worker} supplied {len(grads)} gradient tensors, expected {expected}"
            )

    # zip(*...) groups the same parameter position from every worker;
    # stacking adds a leading worker axis, which mean(dim=0) averages away.
    return [T.stack(per_param).mean(dim=0) for per_param in zip(*gradient_lists)]

# Use it like this:
averaged_gradients = compute_average_gradients([t1, t2])

In [None]:
for e, (a,b,avg) in enumerate(zip(t1, t2, averaged_gradients)):
    print(f'{e}: a shape: {a.shape} b shape: {b.shape} avg shape: {avg.shape}')

In [39]:
avg_grads = [T.tensor([[-5.2145e-01, -1.5849e-02, -1.3393e-01,  1.1772e+00, -2.7421e-01,
          4.9781e-01,  1.0262e+00,  6.5711e-01,  1.8397e+00, -5.7179e-01,
          4.7236e-01,  7.0439e-01, -1.1445e+00,  4.7803e-01,  4.0673e-01,
          4.1135e-01,  4.2574e-01,  4.5169e-01,  4.9280e-01,  5.5587e-01,
          6.5431e-01,  8.1742e-01,  9.8408e-01, -4.1506e-01],
        [-3.5306e-01,  7.1664e-03,  1.6678e-01, -1.9475e+00,  1.5559e+00,
         -9.0469e-01, -1.1267e+00, -7.8315e-01, -2.7417e+00,  1.3949e+00,
         -1.1947e+00, -3.3088e-01,  2.2216e+00, -9.1279e-01, -7.0248e-01,
         -7.1045e-01, -7.3532e-01, -7.8014e-01, -8.5114e-01, -9.6008e-01,
         -1.1301e+00, -1.4118e+00, -1.6945e+00,  6.3116e-01],
        [-4.6914e+00, -2.5275e-01,  7.7788e-01,  1.1055e+00, -7.4956e-02,
          2.9597e-01, -5.3322e-01, -1.1717e-01,  1.4735e-01, -5.3639e-01,
          1.6220e-01, -1.0942e+00, -1.9719e+00,  2.0733e-02,  1.0922e-01,
          1.1046e-01,  1.1433e-01,  1.2130e-01,  1.3234e-01,  1.4927e-01,
          1.7571e-01,  2.1951e-01,  2.9945e-01, -5.6381e-01],
        [ 4.2892e+00,  1.9360e-01, -7.8484e-01,  7.7615e-01, -9.3934e-01,
          3.3579e-01,  1.4373e+00,  4.0648e-01,  1.8965e+00, -2.4611e+00,
          9.2833e-01,  2.3323e+00,  1.2554e+00,  4.5595e-01,  2.1848e-01,
          2.2097e-01,  2.2870e-01,  2.4264e-01,  2.6472e-01,  2.9860e-01,
          3.5148e-01,  4.3910e-01,  4.3864e-01, -6.0237e-01],
        [ 4.8320e+00,  2.3824e-01, -8.9936e-01,  6.6797e-01, -6.3437e-01,
          1.9491e-01,  9.6008e-01,  7.8771e-01,  2.0409e+00, -1.6183e+00,
          9.8363e-01,  2.2024e+00,  1.2620e+00,  3.7173e-01,  3.1300e-01,
          3.1656e-01,  3.2763e-01,  3.4761e-01,  3.7924e-01,  4.2778e-01,
          5.0353e-01,  6.2906e-01,  7.3477e-01, -2.3374e-02],
        [ 2.1023e-01,  4.1719e-02, -2.6093e-01,  1.2462e+00, -1.4800e-01,
          5.4502e-01,  6.4319e-01,  7.9078e-01,  1.1371e+00, -6.8694e-01,
          9.6396e-01,  9.2096e-01, -1.1272e+00,  6.3097e-01,  3.5748e-01,
          3.6154e-01,  3.7420e-01,  3.9701e-01,  4.3314e-01,  4.8857e-01,
          5.7509e-01,  7.1845e-01,  8.3001e-01, -6.1175e-01],
        [-4.9578e-01, -1.5556e-02, -1.6734e-01,  1.0552e+00, -5.1201e-01,
          4.7462e-01,  6.1509e-01,  7.7972e-01,  1.8208e+00, -1.0963e+00,
          4.2536e-01,  8.6592e-01, -1.3242e+00,  8.9632e-01,  4.7180e-01,
          4.7716e-01,  4.9386e-01,  5.2397e-01,  5.7165e-01,  6.4482e-01,
          7.5900e-01,  9.4822e-01,  1.1678e+00, -2.6122e-01],
        [-3.9754e-01, -2.5136e-03, -7.8935e-02,  9.2367e-01, -2.9418e-01,
          4.1963e-01,  3.3779e-01, -2.7741e-02,  9.7311e-01, -4.0594e-01,
          6.2407e-01,  7.6900e-01, -3.8424e-01,  3.5828e-01,  2.1437e-01,
          2.1680e-01,  2.2439e-01,  2.3807e-01,  2.5973e-01,  2.9298e-01,
          3.4486e-01,  4.3083e-01,  4.9894e-01, -5.1309e-01],
        [-3.2574e+00, -1.5342e-01,  5.5975e-01,  1.1411e+00,  5.3212e-01,
          3.2328e-01, -6.7366e-01, -2.2789e-01,  7.7112e-01,  3.5112e-01,
          7.8484e-01, -1.0904e-01, -9.3099e-01,  1.2163e-01,  2.9654e-01,
          2.9991e-01,  3.1040e-01,  3.2932e-01,  3.5929e-01,  4.0528e-01,
          4.7705e-01,  5.9597e-01,  7.6414e-01,  1.9698e-01],
        [-4.8813e-01, -3.4580e-02, -1.4366e-01,  1.4147e+00, -1.1632e+00,
          6.6578e-01,  7.8140e-01,  7.5995e-01,  2.2787e+00, -1.4917e+00,
          5.6674e-01,  9.9323e-01, -1.5039e+00,  1.1914e+00,  5.9005e-01,
          5.9675e-01,  6.1764e-01,  6.5529e-01,  7.1492e-01,  8.0643e-01,
          9.4923e-01,  1.1859e+00,  1.4701e+00, -3.0006e-01]]), T.tensor([-1.2966,  1.9887, -1.4522, -1.2124, -0.5771, -1.6181, -1.0601, -1.3002,
        -0.9284, -1.3919]), T.tensor([[-7.1154e-02, -9.1427e-01,  4.2147e-01,  3.0611e-01,  2.5633e-01,
          5.8600e-02, -2.1559e-01, -8.1068e-01,  2.0530e-01, -3.9817e-01,
         -3.4518e-02,  1.6142e-01,  2.2401e-01,  3.0699e-01],
        [-1.1073e+00,  9.9301e-01, -1.6418e+00, -1.0320e+00, -8.5473e-01,
         -4.2367e-01, -1.2483e+00,  2.3064e-02, -7.9954e-01, -8.1840e-01,
         -1.7031e-01,  2.7618e-01, -6.1193e-01, -3.0330e-01],
        [-7.7668e-01,  6.5305e-01, -1.1443e+00, -7.1940e-01, -5.9716e-01,
         -2.9489e-01, -8.7439e-01,  1.1596e-03, -5.6388e-01, -5.7466e-01,
         -1.1456e-01,  1.9988e-01, -4.3206e-01, -2.0907e-01],
        [-6.7601e-01,  4.7906e-01, -9.8860e-01, -6.2174e-01, -5.1650e-01,
         -2.5613e-01, -7.6256e-01, -2.6988e-02, -4.9511e-01, -5.0215e-01,
         -1.0062e-01,  1.7376e-01, -3.7535e-01, -1.8225e-01],
        [-8.4359e-01,  9.3349e-01, -1.2744e+00, -7.9973e-01, -6.5602e-01,
         -3.2095e-01, -9.5480e-01,  1.0552e-01, -5.9335e-01, -6.3023e-01,
         -1.2289e-01,  2.1371e-01, -4.6257e-01, -2.3313e-01],
        [-9.7698e-01,  1.1075e+00, -1.4723e+00, -9.2457e-01, -7.5868e-01,
         -3.6720e-01, -1.1081e+00,  1.3806e-01, -6.7862e-01, -7.3205e-01,
         -1.5414e-01,  2.4881e-01, -5.2719e-01, -2.5636e-01],
        [-7.1661e-01,  7.5619e-01, -1.0721e+00, -6.7334e-01, -5.5205e-01,
         -2.6852e-01, -8.1136e-01,  7.9512e-02, -4.9702e-01, -5.3545e-01,
         -1.2309e-01,  1.9023e-01, -3.9781e-01, -1.7993e-01],
        [-7.0560e-01,  8.5818e-01, -1.0856e+00, -6.8176e-01, -5.6099e-01,
         -2.6915e-01, -8.0264e-01,  1.1899e-01, -4.9424e-01, -5.2679e-01,
         -1.1314e-01,  1.8165e-01, -3.8532e-01, -1.9486e-01],
        [-1.0112e+00,  1.2006e+00, -1.5350e+00, -9.6373e-01, -7.8788e-01,
         -3.8092e-01, -1.1430e+00,  1.6468e-01, -6.9785e-01, -7.4929e-01,
         -1.5927e-01,  2.5745e-01, -5.4834e-01, -2.8073e-01],
        [-8.4833e-01,  9.9501e-01, -1.2873e+00, -8.0808e-01, -6.6150e-01,
         -3.1903e-01, -9.5554e-01,  1.3387e-01, -5.9017e-01, -6.2816e-01,
         -1.3070e-01,  2.1735e-01, -4.6529e-01, -2.3230e-01]]), T.tensor([-0.2166, -0.6108, -0.4418, -0.4134, -0.4038, -0.4564, -0.3519, -0.3163,
        -0.4539, -0.3856]), T.tensor([[-1.8454,  5.4872,  6.5779,  4.6811,  5.5849,  6.1571,  5.4942,  6.2546,
          5.5929,  6.5376]]), T.tensor([1.0973])]

In [None]:
avg_grads

In [None]:
# Check each computed average against the reference tensor at the same position.
for idx, (computed, reference) in enumerate(zip(averaged_gradients, avg_grads)):
    matches = T.allclose(computed, reference, atol=1e-05)
    print(f'{idx}: {computed.shape} {reference.shape} {matches}')

In [None]:
t1[0][0]

In [None]:
t2[0][0]

In [None]:
averaged_gradients[0][1]

In [None]:
avg_grads[0][1]

# TD3

In [12]:
# num_envs = 4
N = 5
env = gym.make('BipedalWalker-v3')
# env = gym.make('Pendulum-v1')
# env = gym.make('InvertedPendulum-v5')
wrappers = [
    {
        "type": "NStepReward",
        "params": {
            "n": N
        }
    }
]

env_spec = env.spec
env_wrap = GymnasiumWrapper(env_spec, wrappers)
# env_wrap.env = env_wrap._initialize_env(num_envs=num_envs)

In [13]:
# build actor
# Fall back to CPU when no GPU is present so the notebook also runs on CPU-only machines.
device = 'cuda' if T.cuda.is_available() else 'cpu'
# NOTE(review): `optimizer` is not passed to ActorModel below (the DDPG section
# passes optimizer_params=actor_optimizer); in this section it is only handed to
# the critic. Confirm the actor is meant to use ActorModel's default optimizer.
optimizer = {'type': 'Adam','params': { 'lr': 0.001 }}

# Hidden stack: two dense layers (400 and 300 units), each followed by batchnorm and ReLU.
layer_config = [
    # {'type': 'batchnorm1d'},
    {'type': 'dense', 'params': {'units': 400, 'kernel': 'variance_scaling', 'kernel params':{"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}}},
    {'type': 'batchnorm1d'},
    {'type': 'relu'},
    {'type': 'dense', 'params': {'units': 300, 'kernel': 'variance_scaling', 'kernel params':{"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}}},
    {'type': 'batchnorm1d'},
    {'type': 'relu'},
]
# Output layer weights are initialised uniformly in [-3e-3, 3e-3].
output_layer_config = [{'type': 'dense', 'params': {'kernel': 'uniform', 'kernel params':{'a':-3e-3, 'b':3e-3}}}]

actor = ActorModel(env_wrap, layer_config, output_layer_config, device=device)

In [14]:
# build critic

state_layer_config = [
    # {'type': 'batchnorm1d'},
    {'type': 'dense', 'params': {'units': 400, 'kernel': 'variance_scaling', 'kernel params':{"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}}},
    {'type': 'batchnorm1d'},
    {'type': 'relu'}
]

merged_layer_config = [
    {'type': 'dense', 'params': {'units': 300, 'kernel': 'variance_scaling', 'kernel params':{"scale": 1.0, "mode": "fan_in", "distribution": "uniform"}}},
    {'type': 'relu'}
]
# output_layer_config = {'type': 'dense', 'params': {'kernel': 'default', 'kernel params':{}}},

critic = CriticModel(env_wrap, state_layers=state_layer_config, merged_layers=merged_layer_config,
                    output_layer_kernel=output_layer_config, optimizer_params=optimizer, device=device)

In [15]:
# Replay buffer kept on the CPU, plus the exploration noise for the agent.
# NOTE(review): the buffer is built with N=1 while the NStepReward wrapper and the
# TD3 agent in this section both use N; the DDPG and HER sections pass N=N to
# ReplayBuffer. Confirm that N=1 is intentional here.
replay_buffer = ReplayBuffer(env_wrap, 1000000, N=1, device='cpu')
# replay_buffer = PrioritizedReplayBuffer(env_wrap, 100000, alpha=0.6, beta_start=0.4, beta_iter=10000, beta_update_freq=1, priority='rank',normalize=False, epsilon=0.01, N=N, device='cpu')
noise = NormalNoise(shape=env_wrap.action_space.shape, stddev=0.1, device=device)

In [16]:
# Assemble the TD3 agent from the models, buffer and noise built above.
td3 = TD3(
    env=env_wrap,
    actor_model=actor,
    critic_model_a=critic,
    discount=0.99,
    tau=0.005,
    action_epsilon=0.2,
    replay_buffer=replay_buffer,
    batch_size=128,
    noise=noise,
    target_noise=noise,
    actor_update_delay=1,
    grad_clip=40.0,
    warmup=1000,
    N=N,
    callbacks=[rl_callbacks.WandbCallback('BipedalWalker-v3')],
    save_dir='BipedalWalker_N5',
    # Use the same `device` the actor, critic and noise were built with
    # instead of a separate 'cuda' literal (matches the DDPG section).
    device=device
)

In [17]:
td3.save()

In [None]:
td3.train(50, 1, 42, 50)

In [18]:
config = td3.get_config()
# Set train config and path
train_config = {
    'num_episodes': 2000,
    'num_envs': 4,
    'seed': 42,
    'render_freq': 500,
}
# os.path.join gives the same path as before when save_dir already ends with a
# separator, and inserts the missing separator when it does not.
train_config_path = os.path.join(config["save_dir"], 'train_config.json')
with open(train_config_path, 'w') as f:
    json.dump(train_config, f)

In [3]:
td3 = TD3.load(config)

In [None]:
td3.get_config()

In [None]:
td3.state_normalizer.device

## Distributed

In [None]:
distributed_td3 = DistributedAgents(td3.get_config(),
                                     num_workers=4,
                                     learner_device=None,
                                     learn_freq=10,
                                     log_level='debug')

# HER/DDPG

In [2]:
N = 3
env = gym.make('FetchReach-v4')
# env = gym.make('FetchPush-v4')
# env = gym.make('Pendulum-v1')
# env = gym.make('InvertedPendulum-v5')
wrappers = [
    {
        "type": "NStepReward",
        "params": {
            "n": N
        }
    }
]

env_spec = env.spec
env_wrap = GymnasiumWrapper(env_spec, wrappers)

In [3]:
# GOAL SHAPE
goal_shape = env.observation_space['desired_goal'].shape
print(f'goal_shape: {goal_shape}')

goal_shape: (3,)


In [4]:
# build actor
# Fall back to CPU when no GPU is present so the notebook also runs on CPU-only machines.
device = 'cuda' if T.cuda.is_available() else 'cpu'
# NOTE(review): `optimizer` is not passed to ActorModel below (the DDPG section
# passes optimizer_params=actor_optimizer); in this section it is only handed to
# the critic. Confirm the actor is meant to use ActorModel's default optimizer.
optimizer = {'type': 'Adam','params': { 'lr': 0.001 }}

# Hidden stack: three dense layers of 64 units, each followed by ReLU.
layer_config = [
    # {'type': 'batchnorm1d'},
    {'type': 'dense', 'params': {'units': 64, 'kernel': 'xavier_uniform', 'kernel params':{"gain": 1.0}}},
    # {'type': 'batchnorm1d'},
    {'type': 'relu'},
    {'type': 'dense', 'params': {'units': 64, 'kernel': 'xavier_uniform', 'kernel params':{"gain": 1.0}}},
    # {'type': 'batchnorm1d'},
    {'type': 'relu'},
    {'type': 'dense', 'params': {'units': 64, 'kernel': 'xavier_uniform', 'kernel params':{"gain": 1.0}}},
    # {'type': 'batchnorm1d'},
    {'type': 'relu'},
]
# Output layer weights are initialised uniformly in [-3e-3, 3e-3].
output_layer_config = [{'type': 'dense', 'params': {'kernel': 'uniform', 'kernel params':{'a':-3e-3, 'b':3e-3}}}]

actor = ActorModel(env_wrap, layer_config, output_layer_config, device=device)

In [5]:
# build critic

# No state-only layers: the list is intentionally empty.
state_layer_config = [

]

# Merged stack: three 64-unit dense layers, each followed by ReLU.
merged_layer_config = [
    # {'type': 'batchnorm1d'},
    {'type': 'dense', 'params': {'units': 64, 'kernel': 'xavier_uniform', 'kernel params':{"gain": 1.0}}},
    # {'type': 'batchnorm1d'},
    {'type': 'relu'},
    {'type': 'dense', 'params': {'units': 64, 'kernel': 'xavier_uniform', 'kernel params':{"gain": 1.0}}},
    # {'type': 'batchnorm1d'},
    {'type': 'relu'},
    {'type': 'dense', 'params': {'units': 64, 'kernel': 'xavier_uniform', 'kernel params':{"gain": 1.0}}},
    # {'type': 'batchnorm1d'},
    {'type': 'relu'}
]
# The critic-specific output config below is disabled, so `output_layer_config` used in the
# constructor call is whatever the actor cell left in the kernel (hidden cross-cell dependency).
# output_layer_config = {'type': 'dense', 'params': {'kernel': 'default', 'kernel params':{}}},

# `optimizer` and `device` also come from the actor cell.
critic = CriticModel(env_wrap, state_layers=state_layer_config, merged_layers=merged_layer_config,
                    output_layer_kernel=output_layer_config, optimizer_params=optimizer, device=device)

In [6]:
# Replay buffer is created on the CPU while the noise generator uses `device` from the actor cell.
# The goal shape is read straight from the env's 'desired_goal' observation space.
replay_buffer = ReplayBuffer(env_wrap, 1000000, goal_shape=env.observation_space['desired_goal'].shape, N=N, device='cpu')
# Prioritized alternative, currently disabled:
# replay_buffer = PrioritizedReplayBuffer(env_wrap, 100000, beta_start=0.4, beta_iter=20000, beta_update_freq=1, priority='rank',normalize=False, goal_shape=goal_shape, epsilon=0.01, device=device)
noise = NormalNoise(shape=env_wrap.action_space.shape, mean=0.0, stddev=0.1, device=device)
# Noise scheduling is disabled: the linear schedule below is commented out and None is passed instead.
# schedule_config = {'type':'Linear', 'params':{'start_factor':1.0, 'end_factor':0.1, 'total_iters':5000}}
# noise_schedule = ScheduleWrapper(schedule_config)
noise_schedule = None

In [7]:
replay_buffer.get_config()

{'class_name': 'ReplayBuffer',
 'config': {'env': '{"type": "GymnasiumWrapper", "env": "{\\"id\\": \\"FetchReach-v4\\", \\"entry_point\\": \\"gymnasium_robotics.envs.fetch.reach:MujocoFetchReachEnv\\", \\"reward_threshold\\": null, \\"nondeterministic\\": false, \\"max_episode_steps\\": 50, \\"order_enforce\\": true, \\"disable_env_checker\\": false, \\"kwargs\\": {\\"reward_type\\": \\"sparse\\"}, \\"additional_wrappers\\": [], \\"vector_entry_point\\": null}", "wrappers": [{"type": "NStepReward", "params": {"n": 3}}], "worker_id": 0}',
  'buffer_size': 1000000,
  'goal_shape': (3,),
  'N': 3,
  'device': 'cpu'}}

In [8]:
# Assemble the DDPG agent from the actor, critic, replay buffer and noise built in the cells above.
# N is the same n-step value given to the NStepReward wrapper and the replay buffer;
# save_dir matches the one passed to the HER wrapper in the next cell.
ddpg_agent = DDPG(env=env_wrap,
                actor_model=actor,
                critic_model=critic,
                replay_buffer=replay_buffer,
                discount=0.98,
                tau=0.05,
                action_epsilon=0.2,
                batch_size=128,
                noise=noise,
                noise_schedule=noise_schedule,
                grad_clip=40.0,
                warmup=1000,
                N=N,
                callbacks=[rl_callbacks.WandbCallback('FetchReach-v4')],
                save_dir='FetchReach_N3',
                device=device)

In [9]:
# Wrap the DDPG agent in HER using the 'future' relabelling strategy with 4 goals per transition.
# NOTE(review): tolerance=0.05 is presumably the goal-distance success threshold — confirm against HER.
her = HER(
    agent=ddpg_agent,
    strategy='future',
    tolerance=0.05,
    num_goals=4,
    save_dir='FetchReach_N3',
)

In [10]:
her.save()

In [11]:
config = her.get_config()

# Training settings written as JSON into the agent's save_dir.
train_config = {
    'num_epochs': 100,
    'num_cycles': 50,
    'num_episodes': 1,
    'num_updates': 40,
    'num_envs': 16,
    'seed': 42,
    'render_freq': 500,
}
# Use os.path.join instead of `+`: plain concatenation yields
# "<save_dir>train_config.json" whenever save_dir lacks a trailing separator,
# while join gives the identical path when the separator is already there.
train_config_path = os.path.join(config["save_dir"], 'train_config.json')
with open(train_config_path, 'w') as f:
    json.dump(train_config, f)

In [None]:
# In-notebook training run; arguments are passed positionally to her.train.
# NOTE(review): render_freq is 100 here, whereas the train_config.json written in the
# previous cell stores 500 — confirm which value is intended.
num_epochs = 100
num_cycles = 50
num_episodes = 1
num_updates = 40
render_freq = 100
num_envs = 16
seed = 42

her.train(num_epochs, num_cycles, num_episodes, num_updates, render_freq, num_envs, seed)

In [None]:
T.unique(her.agent.replay_buffer.states, dim=0).size()

In [None]:
her.agent.replay_buffer.states.size()

In [None]:
T.count_nonzero(her.agent.replay_buffer.states, dim=0)

In [12]:
# Reload the saved HER config from disk; this rebinds the global `config`.
# NOTE(review): absolute, machine-specific path — consider deriving it from the agent's save_dir.
config_file_path = '/workspaces/PhoenX_RL/src/app/FetchReach_N3/her/config.json'
with open(config_file_path, 'r') as file:
    config = json.load(file)

In [29]:
her = HER.load(config, load_weights=False)

In [None]:
her.save_dir

In [None]:
save_dir = '/'.join(save_dir.split('/')[:-1] + ['worker-0'])
save_dir

In [None]:
# str.split() returns a fresh list, so assigning into its last element never
# touches save_dir itself; rebuild the string with the last path component replaced.
save_dir = '/'.join(save_dir.split('/')[:-1] + ['worker-0'])

In [None]:
save_dir

In [None]:
save_dir.split('/')[-1]

## Distributed Test

In [None]:
# Build a DistributedAgents trainer from the HER agent's config.
# NOTE(review): the variable is still named `distributed_td3` although it wraps the HER config,
# and this call passes `learn_iter=` whereas the TD3 cell passes `learn_freq=` — confirm the
# keyword DistributedAgents actually accepts.
distributed_td3 = DistributedAgents(her.get_config(),
                                     num_workers=4,
                                     learner_device=None,
                                     learn_iter=16,
                                     log_level='debug')

# Actor Critic

In [None]:
env = gym.make("CartPole-v1")

In [None]:
dense_layers = [
    (128, 'relu', "kaiming normal"),
    (256, 'relu', "kaiming normal"),
    ]



In [None]:
# NOTE(review): the notebook's import cell uses `from models import ...` (with `import models`
# commented out), so the bare module name `models` may be unbound on a fresh kernel, and
# PolicyModel is not among the imported names — confirm this legacy cell still runs.
policy_model = models.PolicyModel(env=env, dense_layers=dense_layers, optimizer='Adam', learning_rate=0.001,)

In [None]:
for param in policy_model.parameters():
    print(param)

In [None]:
value_model = models.ValueModel(env, dense_layers=dense_layers, optimizer='Adam', learning_rate=0.001)

In [None]:
value_model

In [None]:
for params in value_model.parameters():
    print(params)

In [None]:
# Actor-critic agent for CartPole with trace decay 0.5 on both the policy and the value model.
# NOTE(review): the import cell uses `from rl_agents import ... ActorCritic ...` and never binds
# the module name `rl_agents` — this qualified reference may raise NameError on a fresh kernel.
actor_critic = rl_agents.ActorCritic(env,
                                     policy_model,
                                     value_model,
                                     discount=0.99,
                                     policy_trace_decay=0.5,
                                     value_trace_decay=0.5,
                                     callbacks=[rl_callbacks.WandbCallback('CartPole-v1-Actor-Critic')])

In [None]:
actor_critic.train(200)

In [None]:
actor_critic.test(10, True, 1)

# REINFORCE

In [None]:
env = gym.make("CartPole-v1")

In [None]:
dense_layers = [
    (128, 'relu', {
                    "kaiming normal": {
                        "a":1.0,
                        "mode":'fan_in'
                    }
                },
    ),
    # (256, 'relu', {
    #                 "kaiming_normal": {
    #                     "a":0.0,
    #                     "mode":'fan_in'
    #                 }
    #             },
    # )
    ]

In [None]:
dense_layers = [(128, 'relu', "kaiming normal")]

In [None]:
value_model = models.ValueModel(env, dense_layers, 'Adam', 0.001)

In [None]:
for param in value_model.parameters():
    print(param)

In [None]:
policy_model = models.PolicyModel(env, dense_layers, 'Adam', 0.001)

In [None]:
for param in policy_model.parameters():
    print(param)

In [None]:
reinforce = rl_agents.Reinforce(env, policy_model, value_model, 0.99, [rl_callbacks.WandbCallback('CartPole-v0_REINFORCE', chkpt_freq=100)])

In [None]:
reinforce.train(200, True, 50)

In [None]:
reinforce.test(10, True, 1)

# DDPG w/CNN

In [None]:
env = gym.make('CarRacing-v2')

In [None]:
cnn_layers = [
    # {
    #     "batchnorm":
    #     {
    #         "num_features":3
    #     }
    # },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 7,
            "stride": 3,
            "padding": 'valid',
            "bias": False
        }
    },
    {
        "relu":
        {

        }
    },
    {
        "batchnorm":
        {
            "num_features":32
        }
    },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 5,
            "stride": 3,
            "padding": 'valid',
            "bias": False,
        }
    },
    {
        "relu":
        {

        }
    },
    {
        "batchnorm":
        {
            "num_features":32
        }
    },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 3,
            "stride": 3,
            "padding": 'valid',
            "bias": False,
        }
    },
]

In [None]:
# Build the shared CNN from the layer specs above.
# NOTE(review): no import of `cnn_models` is visible in the notebook's import cell — confirm it
# is bound before this cell runs.
cnn = cnn_models.CNN(cnn_layers, env)

In [None]:
cnn

In [None]:
# build actor

dense_layers = [
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
]

actor = models.ActorModel(env, cnn_model=cnn, dense_layers=dense_layers, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.0001, normalize=False)

In [None]:
actor

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    )
]


critic = models.CriticModel(env=env, cnn_model=cnn, state_layers=state_layers, merged_layers=merged_layers, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.0001, normalize=False)

In [None]:
critic

In [None]:
# NOTE(review): `helper` is only referenced in a commented-out import in the import cell, while
# ReplayBuffer is imported from `buffer` and NormalNoise from `noise` — these `helper.*`
# references look stale; confirm before running.
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape=(1,))
noise = helper.OUNoise(shape=env.action_space.shape, mean=0.0, theta=0.15, sigma=0.01, dt=1.0, device='cuda')

In [None]:
ddpg_agent = rl_agents.DDPG(
    env,
    actor,
    critic,
    discount=0.98,
    tau=0.05,
    action_epsilon=0.2,
    replay_buffer=replay_buffer,
    batch_size=128,
    noise=noise,
    callbacks=[rl_callbacks.WandbCallback("CarRacing-v2")]
)

In [None]:
ddpg_agent.train(1000, True, 10)

In [None]:
wandb.finish()

In [None]:
wandb.login()

# HER

In [2]:
env = gym.make('FetchReach-v4')
env_spec = env.spec
env_wrap = GymnasiumWrapper(env_spec)

In [None]:
env_wrap.env_spec

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
desired_goal_func(env).shape

In [None]:
# build actor

dense_layers = [
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
]

actor = models.ActorModel(env,
                          cnn_model=None,
                          dense_layers=dense_layers,
                          goal_shape=(3,),
                          optimizer="Adam",
                          optimizer_params={'weight_decay':0.0},
                          learning_rate=0.0001, normalize=False)

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    )
]


critic = models.CriticModel(env=env,
                            cnn_model=None,
                            state_layers=state_layers,
                            merged_layers=merged_layers,
                            goal_shape=(3,),
                            optimizer="Adam",
                            optimizer_params={'weight_decay':0.0},
                            learning_rate=0.0001,
                            normalize=False)

In [None]:
goal_shape = desired_goal_func(env).shape
replay_buffer = helper.ReplayBuffer(env, 100000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape,
#                        mean=0.0,
#                        theta=0.05,
#                        sigma=0.15,
#                        dt=1.0, device='cuda')

noise=helper.NormalNoise(shape=env.action_space.shape,
                         mean = 0.0,
                         stddev=0.05,
                         )

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=256,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('Reacher-v4')])

In [None]:
her = rl_agents.HER(ddpg_agent,
                    strategy='future',
                    num_goals=4,
                    tolerance=0.001,
                    desired_goal=desired_goal_func,
                    achieved_goal=achieved_goal_func,
                    reward_fn=reward_func)

In [None]:
her.train(10, 50, 16, 40, True, 1000)

In [None]:
wandb.finish()

In [None]:
her.test(10, True, 1)

In [None]:
her.save()

In [None]:
her.agent.goal_normalizer.running_std

In [None]:
loaded_her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/assets/models/her")

In [None]:
loaded_her.agent.replay_buffer.sample(10)

In [None]:
loaded_her.agent.state_normalizer.running_cnt

In [None]:
loaded_her.get_config()

In [None]:
loaded_her.test(10, True, 1)

In [None]:
10e4

# HER w/CNN

In [None]:
env = gym.make('CarRacing-v2')

In [None]:
_,_ = env.reset()

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# Inspect the goal shape through the callable unpacked from get_her_goal_functions in the
# previous cell; the bare name `desired_goal` is not that callable.
desired_goal_func(env).shape

In [None]:
cnn_layers = [
    # {
    #     "batchnorm":
    #     {
    #         "num_features":3
    #     }
    # },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 7,
            "stride": 3,
            "padding": 'valid',
            "bias": False
        }
    },
    {
        "relu":
        {

        }
    },
    {
        "batchnorm":
        {
            "num_features":32
        }
    },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 5,
            "stride": 3,
            "padding": 'valid',
            "bias": False,
        }
    },
    {
        "relu":
        {

        }
    },
    {
        "batchnorm":
        {
            "num_features":32
        }
    },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 3,
            "stride": 3,
            "padding": 'valid',
            "bias": False,
        }
    },
]

cnn = cnn_models.CNN(cnn_layers, env)

In [None]:
# build actor

dense_layers = [
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
]

actor = models.ActorModel(env,
                          cnn_model=cnn,
                          dense_layers=dense_layers,
                          goal_shape=(1,),
                          optimizer="Adam",
                          optimizer_params={'weight_decay':0.0},
                          learning_rate=0.001, normalize=False)

In [None]:
actor

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    )
]


critic = models.CriticModel(env=env,
                            cnn_model=cnn,
                            state_layers=state_layers,
                            merged_layers=merged_layers,
                            goal_shape=(1,),
                            optimizer="Adam",
                            optimizer_params={'weight_decay':0.0},
                            learning_rate=0.001,
                            normalize=False)

In [None]:
critic

In [None]:
goal_shape = desired_goal_func(env).shape
replay_buffer = helper.ReplayBuffer(env, 100000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape,
#                        mean=0.0,
#                        theta=0.05,
#                        sigma=0.15,
#                        dt=1.0, device='cuda')

noise=helper.NormalNoise(shape=env.action_space.shape,
                         mean = 0.0,
                         stddev=0.05,
                         )

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=256,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('CarRacing-v2')])

In [None]:
ddpg_agent.actor_model

In [None]:
her = rl_agents.HER(ddpg_agent,
                    strategy='future',
                    num_goals=4,
                    tolerance=1,
                    desired_goal=desired_goal_func,
                    achieved_goal=achieved_goal_func,
                    reward_fn=reward_func)

In [None]:
her.agent.actor_model

In [None]:
her.train(num_epochs=20,
          num_cycles=50,
          num_episodes=16,
          num_updates=40,
          render=True,
          render_freq=20
        )

In [None]:
her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/models/her")

In [None]:
wandb.finish()

In [None]:
# Manually roll out one episode with the HER agent, storing every transition in the
# replay buffer and in local lists so it can be relabelled in the cells below.

# reset environment
state, _ = her.agent.env.reset()
# instantiate empty lists to store current episode trajectory
states, actions, next_states, dones, state_achieved_goals, \
next_state_achieved_goals, desired_goals = [], [], [], [], [], [], []
# set desired goal
desired_goal = her.desired_goal_func(her.agent.env)
# set achieved goal
state_achieved_goal = her.achieved_goal_func(her.agent.env)
# add initial state and goals to local normalizer stats
her.state_normalizer.update_local_stats(state)
her.goal_normalizer.update_local_stats(desired_goal)
her.goal_normalizer.update_local_stats(state_achieved_goal)
# set done flag
done = False
# reset episode reward to 0
episode_reward = 0
# reset steps counter for the episode
episode_steps = 0

while not done:
    # get normalized values for state and desired goal
    state_norm = her.state_normalizer.normalize(state)
    desired_goal_norm = her.goal_normalizer.normalize(desired_goal)
    # get action
    action = her.agent.get_action(state_norm, desired_goal_norm, grad=False)
    # take action
    next_state, reward, term, trunc, _ = her.agent.env.step(action)
    # Resolve the done flag for THIS transition before it is stored. Previously the flag
    # was only updated at the bottom of the loop, so every stored transition (including
    # the terminal one) was recorded with done=False.
    done = bool(term or trunc)
    # get next state achieved goal
    next_state_achieved_goal = her.achieved_goal_func(her.agent.env)
    # add next state and next state achieved goal to normalizers
    her.state_normalizer.update_local_stats(next_state)
    her.goal_normalizer.update_local_stats(next_state_achieved_goal)
    # store trajectory in replay buffer (non normalized!)
    her.agent.replay_buffer.add(state, action, reward, next_state, done,\
                                    state_achieved_goal, next_state_achieved_goal, desired_goal)

    # append step state, action, next state, and goals to respective lists
    states.append(state)
    actions.append(action)
    next_states.append(next_state)
    dones.append(done)
    state_achieved_goals.append(state_achieved_goal)
    next_state_achieved_goals.append(next_state_achieved_goal)
    desired_goals.append(desired_goal)

    # add to episode reward and increment steps counter
    episode_reward += reward
    episode_steps += 1
    # update state and state achieved goal
    state = next_state
    state_achieved_goal = next_state_achieved_goal

In [None]:
# package episode states, actions, next states, and goals into trajectory tuple
trajectory = (states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)

In [None]:
states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals = trajectory

In [None]:
for idx, (s, a, ns, d, sag, nsag, dg) in enumerate(zip(states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)):
    print(f'a={a}, d={d}, sag={sag}, nsag={nsag}, dg={dg}')

In [None]:
# Hindsight relabelling of the trajectory collected above.
# Note: the loop variables (state, action, done, desired_goal, ...) overwrite the kernel
# globals of the same names left by the rollout cell.
strategy = "future"
num_goals = 4

# loop over each step in the trajectory to set new achieved goals, calculate new reward, and save to replay buffer
for idx, (state, action, next_state, done, state_achieved_goal, next_state_achieved_goal, desired_goal) in enumerate(zip(states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)):

    # 'final': relabel with the achieved goal of the last step of the episode
    if strategy == "final":
        new_desired_goal = next_state_achieved_goals[-1]
        new_reward = her.reward_fn(state_achieved_goal, next_state_achieved_goal, new_desired_goal)
        print(f'transition: action={action}, reward={new_reward}, done={done}, state_achieved_goal={state_achieved_goal}, next_state_achieved_goal={next_state_achieved_goal}, desired_goal={new_desired_goal}')
        her.agent.replay_buffer.add(state, action, new_reward, next_state, done, state_achieved_goal, next_state_achieved_goal, new_desired_goal)

    # 'future': relabel with up to num_goals achieved goals sampled from later steps
    if strategy == 'future':
        for i in range(num_goals):
            # stop once fewer than i + 1 later steps remain, so steps near the end get fewer goals
            if idx + i + 1 >= len(states):
                break
            # uniform draw from the strictly-later indices idx + 1 .. len(states) - 1
            goal_idx = np.random.randint(idx + 1, len(states))
            new_desired_goal = next_state_achieved_goals[goal_idx]
            new_reward = her.reward_fn(state_achieved_goal, next_state_achieved_goal, new_desired_goal)
            print(f'transition: action={action}, reward={new_reward}, done={done}, state_achieved_goal={state_achieved_goal}, next_state_achieved_goal={next_state_achieved_goal}, desired_goal={new_desired_goal}')
            her.agent.replay_buffer.add(state, action, new_reward, next_state, done, state_achieved_goal, next_state_achieved_goal, new_desired_goal)
    

    


In [None]:
s, a, r, ns, d, sag, nsag, dg = her.agent.replay_buffer.sample(100)

In [None]:
for i in range(100):
    print(f'{i}: a={a[i]}, r={r[i]}, d={d[i]}, sag={sag[i]}, nsag={nsag[i]}, dg={dg[i]} ')

# HER Pendulum

In [None]:
env = gym.make('Pendulum-v1')

In [None]:
# build actor

dense_layers = [
    (
        400,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        300,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    )
]

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, optimizer='Adam',
                          optimizer_params={'weight_decay':0.01}, learning_rate=0.001, normalize=False)

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    )
]


critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.001, normalize=False)

In [None]:
replay_buffer = helper.ReplayBuffer(env, 100000, (3,))
noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.99,
                            tau=0.005,
                            replay_buffer=replay_buffer,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('Pendulum-v1')])

In [None]:
def desired_goal_func(env):
    """Return the fixed desired goal: a fresh float64 vector of three zeros.

    ``env`` is accepted for interface compatibility and is not used.
    """
    return np.zeros(3, dtype=np.float64)

def achieved_goal_func(env):
    """Return the achieved goal by calling the env's ``_get_obs`` attribute.

    The attribute is looked up through ``env.get_wrapper_attr`` and invoked
    with no arguments; whatever it returns is passed back unchanged.
    """
    get_obs = env.get_wrapper_attr('_get_obs')
    return get_obs()

def reward_func(env):
    """Placeholder reward function: ignores ``env`` and returns ``None``.

    NOTE(review): other cells in this notebook call ``her.reward_fn`` with three
    goal arguments; this one-argument stub would fail there — confirm it is
    never invoked when the HER strategy is 'none'.
    """
    return None

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='none',
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=10.0
)

In [None]:
her.agent.critic_model

In [None]:
her.agent.target_critic_model

In [None]:
her.train(1,1,100,1)

In [None]:
wandb.finish()

In [None]:
state = env.observation_space.sample()
state

In [None]:
her.agent.state_normalizer.normalize(state)

In [None]:
goal = her.desired_goal_func(her.agent.env)
goal

In [None]:
her.agent.goal_normalizer.normalize(goal)

In [None]:
def remove_renders(folder_path):
    """Delete rendered-video artifacts found directly inside ``folder_path``.

    Every directory entry whose name ends in ``.mp4`` or ``.meta.json`` is
    removed with ``os.remove``; all other entries are left alone. The scan is
    not recursive.
    """
    render_suffixes = (".mp4", ".meta.json")
    # os.listdir returns a snapshot list, so collecting first keeps the same removal order.
    targets = [
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        if entry.endswith(render_suffixes)
    ]
    for target in targets:
        os.remove(target)

In [None]:
remove_renders("/workspaces/RL_Agents/pytorch/src/app/assets/models/ddpg/renders/training")

# HER Fetch-Reach (Robotics)

In [None]:
env = gym.make("FetchReach-v3", max_episode_steps=50)

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
achieved_goal_func(env)

In [None]:
env.get_wrapper_attr("_get_obs")()

In [None]:
# reset env state
env.reset()

In [None]:
goal_shape = desired_goal_func(env).shape

In [None]:
goal_shape

In [None]:
# build actor

dense_layers = [
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    )
]

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, goal_shape=goal_shape, optimizer='Adam',
                          optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
actor

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
               
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
]


critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, goal_shape=goal_shape, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
critic

In [None]:
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=256,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback("FetchReach-v2")])

In [None]:
ddpg_agent.critic_model

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='future',
    tolerance=0.05,
    num_goals=4,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=5.0
)

In [None]:
her.train(num_epochs=50,
          num_cycles=50,
          num_episodes=16,
          num_updates=40,
          render=True,
          render_freq=1000)

In [None]:
states, action, rewards, next_states, dones, achieved_goals, next_achieved_goals, desired_goals = her.agent.replay_buffer.sample(2)

In [None]:
desired_goals

In [None]:
her.agent.env.get_wrapper_attr("distance_threshold")

In [None]:
# get success
her.agent.env.get_wrapper_attr("_is_success")(achieved_goal_func(her.agent.env), desired_goal_func(her.agent.env))

In [None]:
her.agent.env.get_wrapper_attr("goal_distance")(next_state_achieved_goal, desired_goal, None)

In [None]:
pusher_her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/assets/models/her")

In [None]:
pusher_her.agent.env.reset()

In [None]:
pusher_her.get_config()

In [None]:
wandb.finish()

In [None]:
np.linalg.norm(pusher_her.agent.env.get_wrapper_attr("get_body_com")("goal") - pusher_her.agent.env.get_wrapper_attr("get_body_com")("object"))

In [None]:
pusher_her.agent.replay_buffer.get_config()

In [None]:

pusher_her.agent.replay_buffer.desired_goals

In [None]:
## TEST ENV
env = gym.make("Pusher-v5", render_mode="rgb_array")

In [None]:
# Wrap the test env so that every episode is recorded (the trigger returns True for all episodes).
# NOTE(review): "/renders/training" starts with a slash, i.e. an absolute path at the filesystem
# root — confirm that is intended rather than a path relative to the project.
env = gym.wrappers.RecordVideo(
                    env,
                    "/renders/training",
                    episode_trigger=lambda x: True,
                )


In [None]:
# Smoke-test the recorded env with 1000 random-action steps.
state, _ = env.reset()

for i in range(1000):
    # take action
    next_state, reward, term, trunc, _ = env.step(env.action_space.sample())
    # An episode that has terminated or been truncated must be reset before stepping
    # again; previously the loop kept calling step() on a finished episode.
    if term or trunc:
        state, _ = env.reset()
env.close()

# HER Fetch Push (Robotics)

In [None]:
env = gym.make('FetchPush-v2')

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# reset env state
env.reset()

In [None]:
goal_shape = desired_goal_func(env).shape

In [None]:
# build actor

# Three identical hidden-layer specs. Each tuple is presumably
# (units, activation, kernel-initializer config) -- TODO confirm against ActorModel.
dense_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

# `ActorModel` is imported by name in the notebook's import cell; the bare `models`
# module is not (`import models` is commented out there), so use the class directly.
actor = ActorModel(env, cnn_model=None, dense_layers=dense_layers, goal_shape=goal_shape, optimizer='Adam',
                   optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
# build critic

# `state_layers` is deliberately left empty; only the merged layers are configured.
state_layers = []

# Three identical layer specs. Each tuple is presumably
# (units, activation, kernel-initializer config) -- TODO confirm against CriticModel.
merged_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

# `CriticModel` is imported by name in the notebook's import cell; the bare `models`
# module is not (`import models` is commented out there), so use the class directly.
critic = CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, goal_shape=goal_shape, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
# Replay buffer and exploration noise handed to the DDPG agent in a later cell.
# NOTE(review): `helper` is not imported in the notebook's import cell (only a commented-out
# `from helper import Normalizer` appears there), whereas `ReplayBuffer` and `NormalNoise` are
# imported from `buffer` and `noise`. Confirm whether this cell should use those instead --
# TODO verify their constructor signatures match the calls below before switching.
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
# Assemble the DDPG agent from the actor, critic, replay buffer and noise created in the cells above.
# `DDPG` is imported by name in the notebook's import cell; the `rl_agents` module itself is not bound.
ddpg_agent = DDPG(
    env=env,
    actor_model=actor,
    critic_model=critic,
    discount=0.98,
    tau=0.05,
    action_epsilon=0.3,
    replay_buffer=replay_buffer,
    batch_size=128,
    noise=noise,
    callbacks=[rl_callbacks.WandbCallback("FetchPush-v2")],
    save_dir="fetch_push/models/ddpg/",
)

In [None]:
# Wrap the DDPG agent in HER, passing the goal/reward functions obtained from gym_helper above.
# `HER` is imported by name in the notebook's import cell; the `rl_agents` module itself is not bound.
her = HER(
    agent=ddpg_agent,
    strategy='final',
    tolerance=0.05,
    num_goals=4,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=5.0,
    save_dir="fetch_push/models/her/"
)

In [None]:
her.train(num_epochs=50,
          num_cycles=50,
          num_episodes=16,
          num_updates=40,
          render=True,
          render_freq=1000)

# TESTING MULTITHREADING

In [None]:
env = gym.make('FetchPush-v2')

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# reset env state
env.reset()

In [None]:
goal_shape = desired_goal_func(env).shape

In [None]:
# build actor

# Three identical hidden-layer specs. Each tuple is presumably
# (units, activation, kernel-initializer config) -- TODO confirm against ActorModel.
dense_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

# `ActorModel` is imported by name in the notebook's import cell; the bare `models`
# module is not (`import models` is commented out there), so use the class directly.
actor = ActorModel(env, cnn_model=None, dense_layers=dense_layers, goal_shape=goal_shape, optimizer='Adam',
                   optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
# build critic

# `state_layers` is deliberately left empty; only the merged layers are configured.
state_layers = []

# Three identical layer specs. Each tuple is presumably
# (units, activation, kernel-initializer config) -- TODO confirm against CriticModel.
merged_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

# `CriticModel` is imported by name in the notebook's import cell; the bare `models`
# module is not (`import models` is commented out there), so use the class directly.
critic = CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, goal_shape=goal_shape, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
# Replay buffer and exploration noise handed to the DDPG agent in a later cell.
# NOTE(review): `helper` is not imported in the notebook's import cell (only a commented-out
# `from helper import Normalizer` appears there), whereas `ReplayBuffer` and `NormalNoise` are
# imported from `buffer` and `noise`. Confirm whether this cell should use those instead --
# TODO verify their constructor signatures match the calls below before switching.
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
# Assemble the DDPG agent from the actor, critic, replay buffer and noise created in the cells above.
# `DDPG` is imported by name in the notebook's import cell; the `rl_agents` module itself is not bound.
ddpg_agent = DDPG(
    env=env,
    actor_model=actor,
    critic_model=critic,
    discount=0.98,
    tau=0.05,
    action_epsilon=0.3,
    replay_buffer=replay_buffer,
    batch_size=128,
    noise=noise,
    callbacks=[rl_callbacks.WandbCallback("FetchPush-v2")],
    save_dir="fetch_push/models/ddpg/",
)

In [None]:
# Wrap the DDPG agent in HER with `num_workers=4`, passing the goal/reward functions obtained above.
# `HER` is imported by name in the notebook's import cell; the `rl_agents` module itself is not bound.
her = HER(
    agent=ddpg_agent,
    strategy='final',
    num_workers=4,
    tolerance=0.05,
    num_goals=4,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=5.0,
    save_dir="fetch_push/models/her/"
)

In [None]:
her.train()

# TESTING

In [None]:
# load config
config_path = "/workspaces/RL_Agents/pytorch/src/app/HER_Test/her/config.json"
with open(config_path, 'r') as file:
    config = json.load(file)

In [None]:
config

In [None]:
# Rebuild the HER agent from the config dict loaded above.
# `HER` is imported by name in the notebook's import cell; the `rl_agents` module itself is not bound.
agent = HER.load(config)

In [None]:
for callback in agent.agent.callbacks:
    print(callback._sweep)

# Co-Occurrence

In [None]:
import subprocess

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'assets/wandb_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    wandb_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(wandb_config)

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'assets/sweep_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    sweep_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(sweep_config)

In [None]:
# Write both configs under ./sweep (created if missing) as JSON files.
# NOTE(review): `sweep_config` is written to train_config.json and `wandb_config` is written to
# sweep_config.json, so the variable names and file names look crossed. Later cells read
# sweep/sweep_config.json back as `sweep_config` and pass it to wandb.sweep -- confirm this mapping is intended.
# Save the updated configuration to a train config file
os.makedirs('sweep', exist_ok=True)
train_config_path = os.path.join(os.getcwd(), 'sweep/train_config.json')
with open(train_config_path, 'w') as f:
    json.dump(sweep_config, f)

# Save and Set the sweep config path
sweep_config_path = os.path.join(os.getcwd(), 'sweep/sweep_config.json')
with open(sweep_config_path, 'w') as f:
    json.dump(wandb_config, f)

In [None]:
command = ['python', 'sweep.py']

# Set the environment variable
os.environ['WANDB_DISABLE_SERVICE'] = 'true'

subprocess.Popen(command)

In [None]:
# Set the environment variable
os.environ['WANDB_DISABLE_SERVICE'] = 'true'

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'sweep/sweep_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    sweep_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(sweep_config)

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'sweep/train_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    train_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(train_config)

In [None]:
# Create the sweep from `sweep_config` (project name read from the same dict) and keep the returned id.
sweep_id = wandb.sweep(sweep=sweep_config, project=sweep_config["project"])
# loop over num wandb agents
# NOTE(review): `num_agents` is only referenced by the commented-out loop below; exactly one agent is started.
num_agents = 1
# for agent in range(num_agents):
# The agent calls wandb_support._run_sweep(sweep_config, train_config) for each run;
# `count` is taken from train_config['num_sweeps'].
wandb.agent(
    sweep_id,
    function=lambda: wandb_support._run_sweep(sweep_config, train_config,),
    count=train_config['num_sweeps'],
    project=sweep_config["project"],
)

In [None]:
sweep_config

# PPO

In [None]:
from pathlib import Path
from typing import List, Tuple
import torch.nn.functional as F
from torch.distributions import Categorical, Beta, Normal, kl_divergence
import time
import cv2

In [None]:
# PARAMS
# env_id = 'Pendulum-v1'
# env_id = 'LunarLanderContinuous-v3'
env_id = 'BipedalWalker-v3'
policy_lr = 3e-4
value_lr = 2e-5
entropy_coeff = 0.1
kl_coeff = 0.1
loss = 'kl'
timesteps = 100_000
num_envs = 10
device = 'cuda'

seed = 42
env = gym.make_vec(env_id, num_envs)
# env = gym.make('BipedalWalker-v3')
# _,_ = env.reset()
# sample = env.action_space.sample()
# if isinstance(sample, np.int64) or isinstance(sample, np.int32):
#     print(f'discrete action space of size {env.action_space.n}')
# elif isinstance(sample, np.ndarray):
#     print(f'continuous action space of size {env.action_space.shape}')

# Seed the RNGs this cell relies on.
T.manual_seed(seed)
T.cuda.manual_seed(seed)
np.random.seed(seed)
# The previous `gym.utils.seeding.np_random.seed = seed` only set an attribute on a function
# object and seeded nothing. Gymnasium environments are seeded through `reset(seed=...)`
# and their spaces through `.seed(...)`.
env.reset(seed=seed)
env.action_space.seed(seed)
# Build policy model
dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
policy = StochasticContinuousPolicy(env, num_envs, dense_layers, learning_rate=policy_lr, distribution='Beta', device=device)
dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
value_function = ValueModel(env, dense_layers, learning_rate=value_lr, device=device)
ppo_agent_hybrid1 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
hybrid_train_info_1 = ppo_agent_hybrid1.train(timesteps=timesteps, trajectory_length=2048, batch_size=640, learning_epochs=10, num_envs=num_envs)

# seed = 43
# env = gym.make(env_id)
# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed
# # Build policy model
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# policy = StochasticContinuousPolicy(env, dense_layers, learning_rate=3e-4)
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# value_function = ValueModel(env, dense_layers, learning_rate=3e-4)
# ppo_agent_hybrid2 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
# hybrid_train_info_2 = ppo_agent_hybrid2.train(timesteps=timesteps, trajectory_length=2048, batch_size=64, learning_epochs=10)

# seed = 44
# env = gym.make(env_id)
# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed
# # Build policy model
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# policy = StochasticContinuousPolicy(env, dense_layers, learning_rate=3e-4)
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# value_function = ValueModel(env, dense_layers, learning_rate=3e-4)
# ppo_agent_hybrid3 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
# hybrid_train_info_3 = ppo_agent_hybrid3.train(timesteps=timesteps, trajectory_length=2048, batch_size=64, learning_epochs=10)
# hybrid_test_info = ppo_agent_hybrid.test(1000, 'PPO_hybrid', 100)

In [None]:
# PARAMS
# env_id = 'Pendulum-v1'
# env_id = 'LunarLanderContinuous-v3'
env_id = 'BipedalWalker-v3'
policy_lr = 3e-4
value_lr = 2e-5
entropy_coeff = 0.1
kl_coeff = 0.01
loss = 'kl'
timesteps = 100_000
num_envs = 10
device = 'cuda'

seed = 42
env = gym.make_vec(env_id, num_envs)
# env = gym.make('BipedalWalker-v3')
# _,_ = env.reset()
# sample = env.action_space.sample()
# if isinstance(sample, np.int64) or isinstance(sample, np.int32):
#     print(f'discrete action space of size {env.action_space.n}')
# elif isinstance(sample, np.ndarray):
#     print(f'continuous action space of size {env.action_space.shape}')

# Seed the RNGs this cell relies on.
T.manual_seed(seed)
T.cuda.manual_seed(seed)
np.random.seed(seed)
# The previous `gym.utils.seeding.np_random.seed = seed` only set an attribute on a function
# object and seeded nothing. Gymnasium environments are seeded through `reset(seed=...)`
# and their spaces through `.seed(...)`.
env.reset(seed=seed)
env.action_space.seed(seed)
# Build policy model
dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
policy = StochasticContinuousPolicy(env, num_envs, dense_layers, learning_rate=policy_lr, distribution='Beta', device=device)
dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
value_function = ValueModel(env, dense_layers, learning_rate=value_lr, device=device)
ppo_agent_hybrid2 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
hybrid_train_info_2 = ppo_agent_hybrid2.train(timesteps=timesteps, trajectory_length=2048, batch_size=640, learning_epochs=10, num_envs=num_envs)

# seed = 43
# env = gym.make(env_id)
# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed
# # Build policy model
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# policy = StochasticContinuousPolicy(env, dense_layers, learning_rate=3e-4)
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# value_function = ValueModel(env, dense_layers, learning_rate=3e-4)
# ppo_agent_hybrid2 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
# hybrid_train_info_2 = ppo_agent_hybrid2.train(timesteps=timesteps, trajectory_length=2048, batch_size=64, learning_epochs=10)

# seed = 44
# env = gym.make(env_id)
# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed
# # Build policy model
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# policy = StochasticContinuousPolicy(env, dense_layers, learning_rate=3e-4)
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# value_function = ValueModel(env, dense_layers, learning_rate=3e-4)
# ppo_agent_hybrid3 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
# hybrid_train_info_3 = ppo_agent_hybrid3.train(timesteps=timesteps, trajectory_length=2048, batch_size=64, learning_epochs=10)
# hybrid_test_info = ppo_agent_hybrid.test(1000, 'PPO_hybrid', 100)

In [None]:
## PARAMS ##
# env_id = 'Pendulum-v1'
# env_id = 'LunarLanderContinuous-v3'
# env_id = 'BipedalWalker-v3'
env_id = 'Humanoid-v5'
# env_id = "Reacher-v5"
# env_id = "Walker2d-v5"
# env_id = 'ALE/SpaceInvaders-ram-v5'
# env_id = "CarRacing-v2"
# env_id = "BipedalWalkerHardcore-v3"

timesteps = 1_000_000
trajectory_length = 2000
batch_size = 64
learning_epochs = 10
num_envs = 16
policy_lr = 3e-4
value_lr = 2e-5
policy_clip = 0.2
entropy_coeff = 0.001
loss = 'hybrid'
kl_coeff = 0.0
normalize_advantages = True
normalize_values = False
norm_clip = np.inf
grad_clip = 40.0
reward_clip = 1.0
lambda_ = 0.0
distribution = 'beta'
device = 'cuda'

# Render Settings
render_freq = 100

## WANDB ##
project_name = 'Humanoid-v5'
run_name = None
callbacks = [WandbCallback(project_name, run_name)]
# callbacks = []

seed = 42
env = gym.make(env_id)

save_dir = 'Humanoid'
# env = gym.make('BipedalWalker-v3')
# _,_ = env.reset()
# sample = env.action_space.sample()
# if isinstance(sample, np.int64) or isinstance(sample, np.int32):
#     print(f'discrete action space of size {env.action_space.n}')
# elif isinstance(sample, np.ndarray):
#     print(f'continuous action space of size {env.action_space.shape}')

# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed

# Build policy model
# dense_layers = [(64,"tanh",{"default":{}}),(64,"tanh",{"default":{}})]
layer_config = [
    # {'type': 'cnn', 'params': {'out_channels': 32, 'kernel_size': (8, 8), 'stride': 4, 'padding': 0}},
    # {'type': 'cnn', 'params': {'out_channels': 64, 'kernel_size': (4, 4), 'stride': 2, 'padding': 0}},
    # {'type': 'cnn', 'params': {'out_channels': 64, 'kernel_size': (3, 3), 'stride': 1, 'padding': 0}},
    # {'type': 'flatten'},
    {'type': 'dense', 'params': {'units': 128, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'tanh'},
    {'type': 'dense', 'params': {'units': 64, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'tanh'},
]
# Output-layer spec shared by the policy and the value model.
# NOTE(review): the trailing comma makes `output_layer_kernel` a 1-tuple containing the dict,
# not the dict itself. Confirm the models expect a sequence here; if they expect a dict, drop the comma.
output_layer_kernel = {'type': 'dense', 'params': {'kernel': 'default', 'kernel params':{}}},
# Policy network built from the same `layer_config` as the value model below, with its own learning rate.
policy = StochasticContinuousPolicy(env, layer_config, output_layer_kernel, learning_rate=policy_lr, distribution=distribution, device=device)
# dense_layers = [(64,"tanh",{"default":{}}),(64,"tanh",{"default":{}})]
value_function = ValueModel(env, layer_config, output_layer_kernel, learning_rate=value_lr, device=device)
ppo = PPO(env, policy, value_function, distribution=distribution, discount=0.99, gae_coefficient=0.95, policy_clip=policy_clip, entropy_coefficient=entropy_coeff,
          loss=loss, kl_coefficient=kl_coeff, normalize_advantages=normalize_advantages, normalize_values=normalize_values, value_normalizer_clip=norm_clip, policy_grad_clip=grad_clip,
          reward_clip=reward_clip, lambda_=lambda_, callbacks=callbacks, save_dir=save_dir,device=device)
hybrid_train_info_2 = ppo.train(timesteps=timesteps, trajectory_length=trajectory_length, batch_size=batch_size, learning_epochs=learning_epochs, num_envs=num_envs, seed=seed, render_freq=render_freq)
# ppo.test(10,"ppo_test", 1)


In [None]:
config_file_path = '/workspaces/RL_Agents/src/app/pong_v5_3/ppo/config.json'
with open(config_file_path, 'r') as file:
    config = json.load(file)

In [None]:
config['wrappers']

In [None]:
pong = PPO.load(config, False)

In [None]:
pong.env.env = pong.env._initialize_env(num_envs=2)

In [None]:
pong.env.action_space

In [None]:
num_envs = 2
action_shape = (3,1)
obs_shape = (3,)

observation_space = gym.spaces.Box(low=0, high=1, shape=(num_envs, *obs_shape))
action_space = gym.spaces.Box(low=0, high=1, shape=(num_envs, *action_shape)) if len(action_shape) > 1 else gym.spaces.MultiDiscrete([action_shape[0] for n in range(num_envs)])
single_observation_space = gym.spaces.Box(low=0, high=1, shape=obs_shape)
single_action_space = gym.spaces.Box(low=0, high=1, shape=action_shape) if len(action_shape) > 1 else gym.spaces.Discrete(action_shape[0])

In [None]:
action_space

In [None]:
single_obs = T.tensor(single_observation_space.sample())
state, info = (T.stack([single_obs for _ in range(observation_space.shape[0])]), {})

In [None]:
state

In [None]:
observation = T.stack([single_obs for _ in range(observation_space.shape[0])])
reward = T.zeros(observation_space.shape[0])
terminated = T.zeros(observation_space.shape[0], dtype=T.bool)
truncated = T.zeros(observation_space.shape[0], dtype=T.bool)
info = {}

In [None]:
vec_env = gym.make_vec("LunarLanderContinuous-v3", 2)

In [None]:
T.ones(vec_env.single_action_space.shape).dim()

In [None]:
from torch.distributions import Normal

num_envs = 2
expected_mu = T.stack([T.tensor([1.65, 1.65, 1.65]) for t in range(num_envs)])
expected_sigma = T.stack([T.tensor([3.9, 3.9, 3.9]) for t in range(num_envs)])
expected_dist = Normal(expected_mu, expected_sigma)

In [None]:
expected_dist.sample().shape

In [None]:
pong.train(2000000, 128, 32, 3, 12, 42)

In [None]:
scores = np.zeros(4)

In [None]:
scores[1] = 1
scores

In [None]:
import gymnasium.wrappers as base_wrappers

# Registry of the gymnasium wrappers that `wrap_env` knows how to apply.
# Each key is the wrapper name used in a wrapper spec's 'type' field and maps to:
#   "cls":            the wrapper class from gymnasium.wrappers
#   "default_params": keyword arguments used unless the spec's 'params' override them
# ResizeObservation's default "shape" is stored as a single int; `wrap_env` expands an
# int shape to a (n, n) tuple before constructing the wrapper.
WRAPPER_REGISTRY = {
    "AtariPreprocessing": {
        "cls": base_wrappers.AtariPreprocessing,
        "default_params": {
            "frame_skip": 1,
            "grayscale_obs": True,
            "scale_obs": True
        }
    },
    "TimeLimit": {
        "cls": base_wrappers.TimeLimit,
        "default_params": {
            "max_episode_steps": 1000
        }
    },
    "TimeAwareObservation": {
        "cls": base_wrappers.TimeAwareObservation,
        "default_params": {
            "flatten": False,
            "normalize_time": False
        }
    },
    "FrameStackObservation": {
        "cls": base_wrappers.FrameStackObservation,
        "default_params": {
            "stack_size": 4
        }
    },
    "ResizeObservation": {
        "cls": base_wrappers.ResizeObservation,
        "default_params": {
            "shape": 84
        }
    }
}

In [None]:
wrappers = [
    {'type': "AtariPreprocessing", 'params': {'frame_skip':1, 'grayscale_obs':True, 'scale_obs':True}},
    {'type': "FrameStackObservation", 'params': {'stack_size':4}},
]

In [None]:
def wrap_env(vec_env, wrappers):
    """Build a SyncVectorEnv whose sub-environments are wrapped as described by `wrappers`.

    Args:
        vec_env: Vector environment used as a template; only `vec_env.spec.id` and
            `vec_env.num_envs` are read.
        wrappers (list[dict]): Wrapper specs of the form
            {'type': <key of WRAPPER_REGISTRY>, 'params': {...}}. 'params' is optional and
            overrides the registry's default params key by key. Specs whose 'type' is not in
            WRAPPER_REGISTRY are skipped. The dicts passed in are never modified.

    Returns:
        SyncVectorEnv: `vec_env.num_envs` newly made environments (render_mode="rgb_array"),
        each with the selected wrappers applied in list order.
    """
    def _expand_int_shape(params):
        # ResizeObservation is given a 2-tuple here: a bare int n becomes (n, n).
        shape = params.get('shape')
        if isinstance(shape, int):
            params['shape'] = (shape, shape)

    wrapper_list = []
    for wrapper in wrappers:
        wrapper_type = wrapper['type']
        if wrapper_type not in WRAPPER_REGISTRY:
            continue
        print(f'wrapper type:{wrapper_type}')
        entry = WRAPPER_REGISTRY[wrapper_type]

        # Work on copies so neither the registry nor the caller's spec is mutated
        # (the shape expansion below writes into these dicts).
        default_params = entry["default_params"].copy()
        override_params = dict(wrapper.get("params", {}))
        if wrapper_type == "ResizeObservation":
            _expand_int_shape(default_params)
            _expand_int_shape(override_params)

        print(f'default params:{default_params}')
        print(f'override params:{override_params}')
        final_params = {**default_params, **override_params}
        print(f'final params:{final_params}')

        # Bind cls/params as default arguments so each factory keeps the values of
        # its own loop iteration rather than the last one.
        def wrapper_factory(env, cls=entry["cls"], params=final_params):
            return cls(env, **params)

        wrapper_list.append(wrapper_factory)

    def apply_wrappers(env):
        # Apply the factories in order, reporting the observation space after each one.
        for factory in wrapper_list:
            env = factory(env)
            print(f'length of obs space:{len(env.observation_space.shape)}')
            print(f'env obs space shape:{env.observation_space.shape}')
        return env

    print(f'wrapper list:{wrapper_list}')
    # One thunk per sub-environment; SyncVectorEnv calls each to create its env.
    envs = [lambda: apply_wrappers(gym.make(vec_env.spec.id, render_mode="rgb_array")) for _ in range(vec_env.num_envs)]
    return SyncVectorEnv(envs)

In [None]:
vec_env = gym.make_vec("ALE/Pong-v5", render_mode="rgb_array", num_envs=8)
wrapped_vec = wrap_env(vec_env, wrappers)

In [None]:
wrapped_vec.single_observation_space

In [None]:
for env in wrapped_vec.envs:
    print(env.spec)

In [None]:
def format_wrappers(wrapper_store):
    """Group flat 'wrapper:<Type>_param:<name>' entries into one spec dict per wrapper.

    Args:
        wrapper_store (dict): Maps keys shaped like 'wrapper:<Type>_param:<name>'
            to the value of that parameter.

    Returns:
        list[dict]: One {'type': <Type>, 'params': {<name>: value, ...}} dict per
        wrapper type, in order of first appearance.
    """
    grouped = {}
    for flat_key, param_value in wrapper_store.items():
        # The text before '_param:' names the wrapper, the piece after it names the parameter
        parts = flat_key.split('_param:')
        print(f'parts:{parts}')
        wrapper_type = parts[0].split('wrapper:')[1]
        print(f'wrapper_type:{wrapper_type}')
        param_name = parts[1]
        print(f'param name:{param_name}')

        # The first time a wrapper type is seen its entry is created; later keys reuse it
        spec = grouped.setdefault(wrapper_type, {'type': wrapper_type, 'params': {}})
        spec['params'][param_name] = param_value

    # Only the grouped specs are returned, as a list
    return [*grouped.values()]

In [None]:
wrapper_params = {'wrapper:AtariPreprocessing_param:frame_skip': 1, 'wrapper:AtariPreprocessing_param:grayscale_obs': True, 'wrapper:AtariPreprocessing_param:scale_obs': True, 'wrapper:FrameStackObservation_param:stack_size': 4}

In [None]:
formatted_wrappers = format_wrappers(wrapper_params)

In [None]:
formatted_wrappers

In [None]:
wrapper_params = {'wrapper:AtariPreprocessing_param:frame_skip': 1, 'wrapper:AtariPreprocessing_param:grayscale_obs': True, 'wrapper:AtariPreprocessing_param:scale_obs': True, 'wrapper:FrameStackObservation_param:stack_size': 4}
formatted_wrappers = dash_utils.format_wrappers(wrapper_params)
#DEBUG
print(f'formatted wrappers:{formatted_wrappers}')
env = dash_utils.instantiate_envwrapper_obj("gymnasium", "ALE/Pong-v5", formatted_wrappers)

In [None]:
config_file_path = '/workspaces/RL_Agents/src/app/humanoid_v5_2/ppo/config.json'
with open(config_file_path, 'r') as file:
    config = json.load(file)
ppo = PPO.load(config, False)

In [None]:
ppo.get_config()

In [None]:
ppo.env.env = ppo.env._initialize_env(0, 8, 42)

In [None]:
for env in ppo.env.env.envs:
    print(env.spec.pprint)

In [None]:
ppo.get_config()

In [None]:
ppo.callbacks = []

In [None]:
ppo.train(2_000_000, 128, 64, 10, 8, 42, render_freq=100)

In [None]:
# Roll the vectorized env forward a few steps, recording the observations before and after each step.
# Reset first so `states` is defined by this cell instead of depending on a value left over
# from an earlier cell (the reset used to be commented out).
states, _ = ppo.env.reset()
steps = 10
all_states = []
all_next_states = []
for step in range(steps):
    actions, log_probs = ppo.get_action(states)
    next_states, rewards, terms, truncs, infos = ppo.env.step(actions)
    all_states.append(states)
    all_next_states.append(next_states)
    # The next observation becomes the input for the following step
    states = next_states

In [None]:
for step, step_states in enumerate(all_states):
    print(f'step states shape:{step_states.shape}')
    for i in range(len(step_states)):
        for j in range(i + 1, len(step_states)):  # Compare each environment with others
            print(f'step state {i} shape:{step_states[i].shape}')
            print(f'step state {j} shape:{step_states[j].shape}')
            assert np.allclose(step_states[i], step_states[j]), f"Environments {i} and {j} differ at step {step}"

In [None]:
for i in range(len(all_states)):
    for j in range(i + 1, len(all_states)):  # Note the change here
        print(np.allclose(all_states[i], all_states[j]))

In [None]:
# Stack ten copies of an (8, 1, 84, 84) array of ones into a single float32 tensor.
obs = np.ones((8, 1, 84, 84))
obs_batches = [obs] * 10
# all_obs = np.array(all_obs)
all_obs = T.stack([T.tensor(batch, dtype=T.float32) for batch in obs_batches])

In [None]:
all_obs.shape

In [None]:
action_space = gym.spaces.Box(low=0, high=1, shape=(2, 3))

In [None]:
np.all

In [None]:
# Shape check: stack one length-128 vector per environment along dim=1.
advantage = T.ones(128)
return_ = T.ones(128)
value = T.ones(128)
num_envs = 2

# One entry per environment (the same tensor object repeated)
all_advantages = [advantage] * num_envs
all_returns = [return_] * num_envs
all_values = [value] * num_envs

# dim=1 puts the environment axis second, giving shape (128, num_envs)
advantages, returns, values = (
    T.stack(per_env, dim=1) for per_env in (all_advantages, all_returns, all_values)
)

In [None]:
advantages.shape

In [None]:
states, _ = pong.env.reset()
states.shape

In [None]:
ns, r, term, trunc, _ = pong.env.step(pong.env.action_space.sample())

In [None]:
r.shape

In [None]:
pong.env.single_observation_space.shape

In [None]:
pong.env.observation_space.shape

In [None]:
pong.env.env.envs[0].spec

In [None]:
states, _ = pong.env.reset()
states = T.tensor(states)
dist, _ = pong.policy_model(states)
sample = dist.sample()
sample.shape

In [None]:
pong.policy_model

In [None]:
pong.env.reset()

In [None]:
def clip_reward(reward, low=-1, high=1):
    """
    Clip a reward to the closed range [low, high].

    Args:
        reward (float): Reward to clip.
        low (float): Lower bound of the range. Defaults to -1.
        high (float): Upper bound of the range. Defaults to 1.

    Returns:
        float: `high` if reward > high, `low` if reward < low, otherwise
        `reward` itself, unchanged.

    Raises:
        ValueError: If `low` is greater than `high`.
    """
    if low > high:
        raise ValueError(f"low ({low}) must not be greater than high ({high})")
    if reward > high:
        return high
    if reward < low:
        return low
    # Inside the range (or not comparable, e.g. NaN): hand the value back untouched
    return reward

In [None]:
env = gym.make_vec("ALE/Pong-v5", 1)

In [None]:
states, _ = env.reset()

In [None]:
all_rewards = []
all_dones = []
for _ in range(10):
    next_states, rewards, terms, truncs, infos = env.step(env.action_space.sample())
    all_rewards.append(rewards)
    all_dones.append(np.logical_or(terms, truncs))
rewards = T.stack([T.tensor(r, dtype=T.float32) for r in all_rewards])
dones = T.stack([T.tensor(d, dtype=T.float32) for d in all_dones])

In [None]:
dones.shape

In [None]:
rewards[:,0].shape

In [None]:
[clip_reward(reward) for reward in rewards]

In [None]:
# Plot a cosine-annealing schedule for the noise standard deviation.
T_max = 6000  # Total steps
eta_max = 1.0  # Initial noise stddev
eta_min = 0.1  # Minimum noise stddev

t = np.linspace(0, T_max, 1000)  # Sample points
# Half-cosine decay: equals eta_max at t = 0 and eta_min at t = T_max
half_range = 0.5 * (eta_max - eta_min)
value = eta_min + half_range * (1 + np.cos(t * np.pi / T_max))

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(t, value, 'b-', label='Cosine Annealing (stddev)')
# Dashed reference lines at the two ends of the schedule
ax.axhline(y=eta_max, color='r', linestyle='--', label='Initial (1.0)')
ax.axhline(y=eta_min, color='g', linestyle='--', label='Minimum (0.1)')
ax.set_xlabel('Steps')
ax.set_ylabel('Noise StdDev')
ax.set_title('Cosine Annealing Curve for Noise (stddev)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
import sys
print(sys.path)  # Shows all directories Python checks for imports

# Try to find mcp specifically
try:
    import mcp
    print(f"MCP found at: {mcp.__file__}")
except ImportError:
    print("MCP not found")

In [None]:
import mcp
print(dir(mcp))  # This will show all available attributes/modules in mcp

# Distributed Training (Tune)

In [2]:
import ray

In [None]:
ray.is_initialized()