In [None]:
import numpy as np
import numba
import umap
import pynndescent

print("NumPy version:", np.__version__)
print("Numba version:", numba.__version__)
print("UMAP version:", umap.__version__)
print("PyNNDescent version:", pynndescent.__version__)


In [1]:
import os
import json
import ale_py

import torch as T
import torch.nn as nn
from torch import optim
import numpy as np
# import pandas as pd
# from umap import UMAP


import torch_utils
from torch import distributions
T.autograd.set_detect_anomaly(True)

import gymnasium as gym
import gymnasium_robotics as gym_robo
# import models
from models import ValueModel, StochasticContinuousPolicy, ActorModel, StochasticDiscretePolicy, CriticModel
import cnn_models
from rl_agents import PPO, Reinforce, ActorCritic, DDPG#, HER,  TD3,
import rl_callbacks
from rl_callbacks import WandbCallback
# from helper import Normalizer
import gym_helper
import wandb_support
import wandb
from env_wrapper import EnvWrapper, GymnasiumWrapper, IsaacSimWrapper, atari_wrappers
from dash_utils import get_wrappers_dropdown_options
from schedulers import ScheduleWrapper
from adaptive_kl import AdaptiveKL
from noise import *
from buffer import ReplayBuffer

error: XDG_RUNTIME_DIR is invalid or not set in the environment.


In [None]:
import mujoco

In [None]:
mujoco.MjModel

In [None]:
gym_robo.__version__

In [None]:
print(f"PyTorch version: {T.__version__}")
print(f"CUDA version: {T.version.cuda}")
print(f"cuDNN version: {T.backends.cudnn.version()}")

In [None]:
def check_cuda():
    """Print whether CUDA is usable and, if it is, the name and total memory of each visible GPU."""
    # Guard clause: nothing further to report without CUDA.
    if not T.cuda.is_available():
        print("CUDA is not available.")
        return

    print("CUDA is available.")
    gpu_count = T.cuda.device_count()
    print(f"Number of GPUs detected: {gpu_count}")

    bytes_per_gb = 1024 ** 3
    for idx in range(gpu_count):
        name = T.cuda.get_device_name(idx)
        # total_memory is reported in bytes; convert to GB for display.
        total_gb = T.cuda.get_device_properties(idx).total_memory / bytes_per_gb
        print(f"GPU {idx}: {name}")
        print(f"Total memory: {total_gb:.2f} GB")

check_cuda()

In [None]:
def get_default_device():
    """Pick the torch device for computations: 'cuda' when a GPU is usable, 'cpu' otherwise."""
    device_name = 'cuda' if T.cuda.is_available() else 'cpu'
    return T.device(device_name)

device = get_default_device()
print(f"Using device: {device}")

# TEST

In [None]:
# ENV config
env_id = "Pendulum-v1"
gym_env = gym.make(env_id)
env = GymnasiumWrapper(gym_env.spec)
env.env = env._initialize_env(0, 4, 42)

In [None]:
env.action_space.shape

In [None]:
actions = np.ones(4)
actions.shape

In [None]:
actions = env.format_actions(actions)
actions.shape

In [None]:
if isinstance(env.action_space, gym.spaces.Box):
    print(f'box shape {env.action_space.shape}')
    
elif isinstance(env.action_space, gym.spaces.Discrete):
    print(f'discrete shape {env.action_space.shape}')
elif isinstance(env.action_space, gym.spaces.MultiDiscrete):
    print(f'multi discrete shape {env.action_space.shape}')

In [None]:
env.action_space.shape[-1]

In [None]:
env.step(env.action_space.sample())

In [None]:
gym_robo.register_robotics_envs()

In [None]:
gym.envs.registration.registry

In [None]:
# Never commit API keys to a notebook: read the W&B key from the environment.
# When WANDB_API_KEY is unset this passes key=None, which is wandb.login's default.
# NOTE(review): the key previously hard-coded here is exposed in history and should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
env = gym.make('FetchPush-v2', max_episode_steps=100, render_mode='rgb_array')
env = gym.wrappers.RecordVideo(env, 'test/', episode_trigger=lambda i: i%1==0)

# episodes = 10


# for episode in range(episodes):
#     done = False
#     obs, _ = env.reset()
#     while not done:
#         obs, r, term, trunc, dict = env.step(env.action_space.sample())
#         if term or trunc:
#             done = True
# env.close()

In [None]:
env = gym.make("FetchReach-v2")
env.reset()
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())

# The following always has to hold:
assert reward == env.compute_reward(obs["achieved_goal"], obs["desired_goal"], info)
assert truncated == env.compute_truncated(obs["achieved_goal"], obs["desired_goal"], info)
assert terminated == env.compute_terminated(obs["achieved_goal"], obs["desired_goal"], info)

In [None]:
# compute_reward takes the achieved goal, the desired goal and the step info
# (same call shape as in the assertions of the previous cell); calling it with
# no arguments raises TypeError.
env.compute_reward(obs["achieved_goal"], obs["desired_goal"], info)

In [None]:
env = gym.make('FetchPush-v3', render_mode='rgb_array')

In [None]:
env.observation_space

In [None]:
if hasattr(env, "distance_threshold"):
    print('true')
else:
    print('false')

In [None]:
if env.get_wrapper_attr("distance_threshold"):
    print('true')

In [None]:
print(dir(env))


# DDPG

In [None]:
# ENV config
env_id = "Pendulum-v1"
gym_env = gym.make(env_id)
env_spec = gym_env.spec.to_json()
env = GymnasiumWrapper(gym_env.spec)

In [None]:
env.single_action_space.high

In [2]:
# ENV config
env_id = "Pendulum-v1"
gym_env = gym.make(env_id)
env_spec = gym_env.spec.to_json()
env = GymnasiumWrapper(gym_env.spec)

# config
num_episodes = 100
num_envs = 1
seed = 42
policy_lr = 1e-3
value_lr = 2e-3
policy_optimizer = {'type':'Adam', 'params':{'lr':policy_lr}}
value_optimizer = {'type':'Adam', 'params':{'lr':value_lr}}
# lr_scheduler = {'type':'LinearLR', 'params':{'start_factor':1.0, 'end_factor':0.04, 'total_iters':1500, 'last_epoch':-1}}
lr_scheduler = None
normalize_inputs = True
normalizer_clip = 5
normalizer_eps = 0.01
warmup = 500
discount = 0.99
tau = 0.005
action_epsilon = 0.2
batch_size = 64
replay_buffer = ReplayBuffer(env, 100000)
# noise = OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
# Choose the noise device from CUDA availability instead of hard-coding 'cuda',
# so the cell also runs on CPU-only machines (unchanged when a GPU is present).
noise = NormalNoise(env.action_space.shape, mean=0, stddev=0.1, device='cuda' if T.cuda.is_available() else 'cpu')

# Render Settings
render_freq = 20

## WANDB ##
project_name = 'Pendulum-v1'
run_name = None
callbacks = [WandbCallback(project_name, run_name)]
# callbacks = []
save_dir = "Pendulum_v1"


# Model config
policy_config = [
    {'type': 'dense', 'params': {'units': 400, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'relu'},
    {'type': 'dense', 'params': {'units': 300, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'relu'},
]

state_config = [
    {'type': 'dense', 'params': {'units': 400, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'relu'},
]

merged_config = [
    {'type': 'dense', 'params': {'units': 300, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'relu'},
]

actor_output_layer = [{'type': 'dense', 'params': {'kernel': 'uniform', 'kernel params':{'a':-3e-3, 'b':3e-3}}}]
critic_output_layer = [{'type': 'dense', 'params': {'kernel': 'default', 'kernel params':{}}}]

actor = ActorModel(env, policy_config, actor_output_layer, policy_optimizer, lr_scheduler)
critic = CriticModel(env, state_config, merged_config, critic_output_layer, value_optimizer, lr_scheduler)
ddpg = DDPG(env, actor, critic, replay_buffer, discount, tau, action_epsilon, batch_size, noise, normalize_inputs, normalizer_clip, normalizer_eps, warmup, callbacks, save_dir)

In [3]:
ddpg.train(num_episodes, num_envs, seed, render_freq)

[34m[1mwandb[0m: Currently logged in as: [33mjasonhayes1987[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


episode 1, score -1353.5256291361895, avg_score -1353.5256291361895
episode 2, score -1471.1258760330727, avg_score -1412.3257525846311
episode 3, score -1346.9204148688123, avg_score -1390.5239733460248
episode 4, score -1430.423361534624, avg_score -1400.4988203931746
episode 5, score -1599.2003740219388, avg_score -1440.2391311189274
episode 6, score -1375.825803633311, avg_score -1429.5035765379914
episode 7, score -1352.2117236123593, avg_score -1418.461883262901
episode 8, score -657.6990653129777, avg_score -1323.3665310191607
episode 9, score -684.5134870166377, avg_score -1252.3828594633248
episode 10, score -1532.5198119155143, avg_score -1280.3965547085438
episode 11, score -797.0933273488246, avg_score -1236.459897675842
episode 12, score -1226.1644176216507, avg_score -1235.6019410046595
episode 13, score -1042.7974579682377, avg_score -1220.7708269249347
episode 14, score -793.7640168599983, avg_score -1190.270340491725
episode 15, score -781.1665799428989, avg_score -116

                                                               

Moviepy - Done !
Moviepy - video ready Pendulum_v1/ddpg/renders/train/episode_20.0.mp4
episode rendered
episode 20, score -772.4356384504451, avg_score -1019.7063641168312




episode 21, score -7.553236186178613, avg_score -971.5085961201333
episode 22, score -259.04370816315907, avg_score -939.1238284857255
episode 23, score -249.00614119376144, avg_score -909.1187116469445
episode 24, score -254.97445726443286, avg_score -881.8627010476731
episode 25, score -265.53039128799026, avg_score -857.2094086572858
episode 26, score -117.6072449866042, avg_score -828.7631715930288
episode 27, score -385.74491833912384, avg_score -812.3550881391806
episode 28, score -253.0809798503406, avg_score -792.3810128431505
episode 29, score -3.1767333701666507, avg_score -765.1670721716683
episode 30, score -371.12065576337665, avg_score -752.0321916247253
episode 31, score -134.9671583071255, avg_score -732.1268679693189
episode 32, score -259.123826276981, avg_score -717.3455229164333
episode 33, score -384.28264532315694, avg_score -707.2527084439098
episode 34, score -128.348569826949, avg_score -690.2261161316463
episode 35, score -494.0677127768903, avg_score -684.621

error: XDG_RUNTIME_DIR is invalid or not set in the environment.


rendering episode...
Moviepy - Building video Pendulum_v1/ddpg/renders/train/episode_40.0.mp4.
Moviepy - Writing video Pendulum_v1/ddpg/renders/train/episode_40.0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready Pendulum_v1/ddpg/renders/train/episode_40.0.mp4
episode rendered
episode 40, score -263.6485354685722, avg_score -637.8188843872788




episode 41, score -137.8685335857538, avg_score -625.6249733921196
episode 42, score -512.0003476678049, avg_score -622.9196251605883
episode 43, score -482.95150229468544, avg_score -619.6645525358
episode 44, score -253.09581708220924, avg_score -611.3334449118547
episode 45, score -376.0733605281595, avg_score -606.1054430366615
episode 46, score -381.33020786470803, avg_score -601.2190248807494
episode 47, score -395.5527745097055, avg_score -596.8431472132804
episode 48, score -133.3266794182599, avg_score -587.1865541342175
episode 49, score -254.5543858837453, avg_score -580.3981425372691
episode 50, score -387.8785394764892, avg_score -576.5477504760535
episode 51, score -388.75680689152136, avg_score -572.8655751116509
episode 52, score -127.13188258122213, avg_score -564.2937733322195
episode 53, score -384.2710774333548, avg_score -560.8971186926183
episode 54, score -258.92790444845775, avg_score -555.3050962066153
episode 55, score -264.8639523115742, avg_score -550.024348

error: XDG_RUNTIME_DIR is invalid or not set in the environment.


rendering episode...
Moviepy - Building video Pendulum_v1/ddpg/renders/train/episode_60.0.mp4.
Moviepy - Writing video Pendulum_v1/ddpg/renders/train/episode_60.0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready Pendulum_v1/ddpg/renders/train/episode_60.0.mp4
episode rendered
episode 60, score -387.2006546790449, avg_score -527.7234012631241




episode 61, score -130.50215800263285, avg_score -521.211577603116
episode 62, score -365.31445561845027, avg_score -518.6971078936859
episode 63, score -493.14076284114515, avg_score -518.2914516230106
episode 64, score -378.2995423369917, avg_score -516.1040780404165
episode 65, score -488.90759214812584, avg_score -515.6856705651505
episode 66, score -377.0727843383171, avg_score -513.5854753192893
episode 67, score -261.39457760919385, avg_score -509.82143206988496
episode 68, score -241.13853825365408, avg_score -505.8702130431757
episode 69, score -1778.4393114610082, avg_score -524.3132434550283
episode 70, score -385.85134002027803, avg_score -522.3352162631033
episode 71, score -133.41244683312982, avg_score -516.857430778174
episode 72, score -1868.7615105178252, avg_score -535.6338763301137
episode 73, score -151.39621990490548, avg_score -530.3703467900424
episode 74, score -385.3485467253876, avg_score -528.4105927351145
episode 75, score -387.9107146966972, avg_score -526

error: XDG_RUNTIME_DIR is invalid or not set in the environment.


rendering episode...
Moviepy - Building video Pendulum_v1/ddpg/renders/train/episode_80.0.mp4.
Moviepy - Writing video Pendulum_v1/ddpg/renders/train/episode_80.0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready Pendulum_v1/ddpg/renders/train/episode_80.0.mp4
episode rendered
episode 80, score -507.5169180136044, avg_score -519.8751304103298




episode 81, score -517.9094822715302, avg_score -519.8508631493569
episode 82, score -495.6640826312069, avg_score -519.5559024113307
episode 83, score -261.5628523609577, avg_score -516.4475524107238
episode 84, score -491.3117680445563, avg_score -516.1483168825552
episode 85, score -383.1740604737153, avg_score -514.5839138659805
episode 86, score -131.22508843510082, avg_score -510.12625310515637
episode 87, score -384.2408456394426, avg_score -508.6792943986539
episode 88, score -483.7353687536135, avg_score -508.3958406981421
episode 89, score -272.4291501698707, avg_score -505.7445295686109
episode 90, score -379.6416778307314, avg_score -504.34338677152334
episode 91, score -367.16281071090134, avg_score -502.8359079137143
episode 92, score -258.45227904267017, avg_score -500.1795641216377
episode 93, score -266.0451040819499, avg_score -497.66198928250134
episode 94, score -384.3631271123188, avg_score -496.45668223813766
episode 95, score -382.55308291035885, avg_score -495.2

error: XDG_RUNTIME_DIR is invalid or not set in the environment.


rendering episode...
Moviepy - Building video Pendulum_v1/ddpg/renders/train/episode_100.0.mp4.
Moviepy - Writing video Pendulum_v1/ddpg/renders/train/episode_100.0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready Pendulum_v1/ddpg/renders/train/episode_100.0.mp4
episode rendered
episode 100, score -500.9187137284398, avg_score -511.91605844755327


0,1
action_0_noise,▅▆▆▆▂▃▇▅▁▃▅▆▃▆▁▇▅▅▅▆▄█▃▅▅▇▅▄▆▅▅▅█▅▅▂▅▂▅▃
actor_loss,▁▁▂▄▄▅▆▆▆▆▆▇██▆▇▇▇▆▇▇▆▇▆▆▇▅▆▆▆▆▅▆▅▅▆▅▅▄▆
actor_predictions,█▃▃▁▂▄▄▄▄▂▃▄▄▃▆▃▂▄▃▄▂▄▄▃▃▄▄▄▅▃▄▃▃▃▅▃▄▃▄▂
best,▁▁▁▁████████████████████████▁▁▁▁▁▁▁▁▁██▁
critic_loss,▁▁▁▁▁▁▁▁▁▃▁▁▅▂▁▁▁▄▂▁▁▁▅▁▃▃▂▂▁█▅▂▂▅▂▁▂▂▁▁
critic_predictions,██▇▂▄▁▃▂▃▂▅▅▂▂▃▇▅▂▃▄▆▄▄▆▇▄▄▄▅▄▃▆▇▇▃▅▁▆▆▃
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
episode_reward,▃▂▃▆▅▅▆▇█▇▇▇▇██▇▇▇▇▇▇▇▇▆▇▇▇▁▇▇▆▆▇▇▇▇▇▇▇▆
step_reward,▆▆▁▅█▇█▅▅████▆█▁▂▆███▆███▇█▃▃█▇████▇████
step_rewards,▆▆▆▆▆▆▆▆▆▆▃▃▃▃▃▃▃▃▃▃▃▃▃██████████▁▁▁▁▁▁▁

0,1
action_0_noise,-0.04666
actor_loss,61.55302
actor_predictions,0.43935
best,0.0
critic_loss,170.22952
critic_predictions,-61.55302
episode,100.0
episode_reward,-500.91871
step_reward,-0.64756
step_rewards,-0.04941


In [None]:
ddpg.test(10, 1, 42, 0)

In [None]:
# load config
config_path = "/workspaces/RL_Agents/src/app/Pendulum_v1/ddpg/config.json"
with open(config_path, 'r') as file:
    config = json.load(file)
# print(config)
ddpg = DDPG.load(config, True)

In [None]:
ddpg.get_config()

# Actor Critic

In [None]:
env_id = "CartPole-v1"
gym_env = gym.make(env_id)
env_spec = gym_env.spec.to_json()
env = GymnasiumWrapper(gym_env.spec)

In [None]:
env.env_spec

In [None]:
# ENV config
env_id = "CartPole-v1"
gym_env = gym.make(env_id)
env_spec = gym_env.spec.to_json()
env = GymnasiumWrapper(gym_env.spec)

# Actor-Critic config
num_episodes = 500
num_envs = 1
seed = 42
policy_lr = 5e-4
value_lr = 5e-4
policy_optimizer = {'type':'Adam', 'params':{'lr':policy_lr}}
value_optimizer = {'type':'Adam', 'params':{'lr':value_lr}}
policy_trace_decay = 0.0
value_trace_decay = 0.0
discount = 0.99

# Render Settings
render_freq = 0

## WANDB ##
project_name = 'CartPole-v1'
run_name = None
# callbacks = [WandbCallback(project_name, run_name)]
callbacks = []
save_dir = "CartPole_v1"


# Model config
layer_config = [
    {'type': 'dense', 'params': {'units': 128, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'relu'},
    {'type': 'dense', 'params': {'units': 256, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'relu'},
]
# Output layer spec as a list of layer dicts, matching actor_output_layer / critic_output_layer
# in the DDPG cell. The previous trailing comma silently produced a 1-tuple instead.
output_layer = [{'type': 'dense', 'params': {'kernel': 'default', 'kernel params':{}}}]

policy_model = StochasticDiscretePolicy(env, layer_config, output_layer, policy_optimizer)
value_model = ValueModel(env, layer_config, output_layer, value_optimizer)
ac = ActorCritic(env, policy_model, value_model, discount, policy_trace_decay, value_trace_decay, callbacks, save_dir)

ac.train(num_episodes, num_envs, seed, render_freq)

# REINFORCE

In [None]:
# ENV config
env_id = "CartPole-v1"
gym_env = gym.make(env_id)
env_spec = gym_env.spec.to_json()
env = GymnasiumWrapper(gym_env.spec)

# Reinforce config
num_episodes = 200
num_envs = 1
trajectories_per_update = 1
seed = 42
policy_lr = 1e-3
value_lr = 1e-3
policy_optimizer = {'type':'Adam', 'params':{'lr':policy_lr}}
value_optimizer = {'type':'Adam', 'params':{'lr':value_lr}}

# Render Settings
render_freq = 20

## WANDB ##
project_name = 'CartPole-v1'
run_name = None
# callbacks = [WandbCallback(project_name, run_name)]
callbacks = []
save_dir = "CartPole_v1"


# Model config
layer_config = [
    {'type': 'dense', 'params': {'units': 64, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'relu'},
    {'type': 'dense', 'params': {'units': 64, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'relu'},
]
# Output layer spec as a list of layer dicts, matching actor_output_layer / critic_output_layer
# in the DDPG cell. The previous trailing comma silently produced a 1-tuple instead.
output_layer = [{'type': 'dense', 'params': {'kernel': 'default', 'kernel params':{}}}]

policy_model = StochasticDiscretePolicy(env, layer_config, output_layer, policy_optimizer)
value_model = ValueModel(env, layer_config, output_layer, value_optimizer)
reinforce = Reinforce(env, policy_model, value_model, callbacks=callbacks, save_dir=save_dir)

reinforce.train(num_episodes, num_envs, trajectories_per_update, seed, render_freq)

In [None]:
# load config
config_path = "/workspaces/RL_Agents/src/app/reinforce_test/reinforce/config.json"
with open(config_path, 'r') as file:
    config = json.load(file)

loaded_reinforce = Reinforce.load(config, False)

In [None]:
loaded_reinforce.get_config()

In [None]:
loaded_reinforce.train(500, 1, 1, 42, 20)

# DDPG w/CNN

In [None]:
env = gym.make('CarRacing-v2')

In [None]:
cnn_layers = [
    # {
    #     "batchnorm":
    #     {
    #         "num_features":3
    #     }
    # },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 7,
            "stride": 3,
            "padding": 'valid',
            "bias": False
        }
    },
    {
        "relu":
        {

        }
    },
    {
        "batchnorm":
        {
            "num_features":32
        }
    },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 5,
            "stride": 3,
            "padding": 'valid',
            "bias": False,
        }
    },
    {
        "relu":
        {

        }
    },
    {
        "batchnorm":
        {
            "num_features":32
        }
    },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 3,
            "stride": 3,
            "padding": 'valid',
            "bias": False,
        }
    },
]

In [None]:
cnn = cnn_models.CNN(cnn_layers, env)

In [None]:
cnn

In [None]:
# build actor

dense_layers = [
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
]

# Build the DDPG actor on top of the shared CNN feature extractor.
# NOTE(review): `models` is not bound in the import cell shown in this notebook (`import models` is
# commented out; only `ActorModel` is imported by name), so this line raises NameError on a fresh kernel.
# The keyword arguments here also differ from the positional ActorModel(...) call in the DDPG section;
# presumably this cell targets an older model API — TODO confirm and port.
actor = models.ActorModel(env, cnn_model=cnn, dense_layers=dense_layers, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.0001, normalize=False)

In [None]:
actor

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    )
]


# Build the DDPG critic on top of the shared CNN feature extractor.
# NOTE(review): `models` is not bound in the import cell shown in this notebook (`import models` is
# commented out; only `CriticModel` is imported by name), so this line raises NameError on a fresh kernel.
# The keyword arguments here also differ from the positional CriticModel(...) call in the DDPG section;
# presumably this cell targets an older model API — TODO confirm and port.
critic = models.CriticModel(env=env, cnn_model=cnn, state_layers=state_layers, merged_layers=merged_layers, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.0001, normalize=False)

In [None]:
critic

In [None]:
# Replay buffer and exploration noise for the CNN-based DDPG agent.
# NOTE(review): no `helper` module is imported in the import cell shown (the only `helper` import there is
# commented out), so both lines raise NameError on a fresh kernel. `ReplayBuffer` is imported from `buffer`,
# and the noise classes presumably come from `from noise import *` — TODO confirm and drop the `helper.` prefix.
# NOTE(review): device='cuda' is hard-coded; this will presumably fail on CPU-only machines — confirm.
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape=(1,))
noise = helper.OUNoise(shape=env.action_space.shape, mean=0.0, theta=0.15, sigma=0.01, dt=1.0, device='cuda')

In [None]:
# Assemble the CNN-based DDPG agent from the actor, critic, replay buffer and noise built above.
# NOTE(review): the module name `rl_agents` is not bound by the import cell shown (it only does
# `from rl_agents import PPO, Reinforce, ActorCritic, DDPG`), so `rl_agents.DDPG` raises NameError on a
# fresh kernel; `DDPG` itself is in scope. The argument list also differs from the DDPG(...) call in the
# Pendulum section — presumably an older signature; TODO confirm before porting.
ddpg_agent = rl_agents.DDPG(
    env,
    actor,
    critic,
    discount=0.98,
    tau=0.05,
    action_epsilon=0.2,
    replay_buffer=replay_buffer,
    batch_size=128,
    noise=noise,
    callbacks=[rl_callbacks.WandbCallback("CarRacing-v2")]
)

In [None]:
ddpg_agent.train(1000, True, 10)

In [None]:
wandb.finish()

In [None]:
wandb.login()

# HER

In [None]:
env = gym.make("Reacher-v4")

In [None]:
_,_ = env.reset()

In [None]:
achieved_goal = gym_helper.reacher_achieved_goal(env)
action = env.action_space.sample()
env.step(action)
print(f'observation: {env.get_wrapper_attr("_get_obs")()}')
print(f'distance to goal: {env.get_wrapper_attr("_get_obs")()[8::]}')
print(f'fingertip: {env.get_wrapper_attr("get_body_com")("fingertip")}')
print(f'target: {env.get_wrapper_attr("get_body_com")("target")}')

In [None]:
next_achieved_goal = env.get_wrapper_attr("_get_obs")()[8::]
desired_goal = [0.0, 0.0, 0.0]

In [None]:
# reward_func is only bound by get_her_goal_functions; bind it here so this cell
# does not depend on the later cell having been run first (Restart & Run All safe).
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)
reward_func(env, action, achieved_goal, next_achieved_goal, desired_goal, 0.05)

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
desired_goal_func(env).shape

In [None]:
# build actor

dense_layers = [
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
]

actor = models.ActorModel(env,
                          cnn_model=None,
                          dense_layers=dense_layers,
                          goal_shape=(3,),
                          optimizer="Adam",
                          optimizer_params={'weight_decay':0.0},
                          learning_rate=0.0001, normalize=False)

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    )
]


critic = models.CriticModel(env=env,
                            cnn_model=None,
                            state_layers=state_layers,
                            merged_layers=merged_layers,
                            goal_shape=(3,),
                            optimizer="Adam",
                            optimizer_params={'weight_decay':0.0},
                            learning_rate=0.0001,
                            normalize=False)

In [None]:
goal_shape = desired_goal_func(env).shape
replay_buffer = helper.ReplayBuffer(env, 100000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape,
#                        mean=0.0,
#                        theta=0.05,
#                        sigma=0.15,
#                        dt=1.0, device='cuda')

noise=helper.NormalNoise(shape=env.action_space.shape,
                         mean = 0.0,
                         stddev=0.05,
                         )

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=256,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('Reacher-v4')])

In [None]:
# Wrap the DDPG agent in Hindsight Experience Replay using the goal/reward functions for this env.
# NOTE(review): the module name `rl_agents` is not bound by the import cell shown, and `HER` is commented
# out of the `from rl_agents import ...` line there, so this raises NameError on a fresh kernel — TODO
# confirm whether HER is still available and import it explicitly.
her = rl_agents.HER(ddpg_agent,
                    strategy='future',
                    num_goals=4,
                    tolerance=0.001,
                    desired_goal=desired_goal_func,
                    achieved_goal=achieved_goal_func,
                    reward_fn=reward_func)

In [None]:
her.train(10, 50, 16, 40, True, 1000)

In [None]:
wandb.finish()

In [None]:
her.test(10, True, 1)

In [None]:
her.save()

In [None]:
her.agent.goal_normalizer.running_std

In [None]:
loaded_her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/assets/models/her")

In [None]:
loaded_her.agent.replay_buffer.sample(10)

In [None]:
loaded_her.agent.state_normalizer.running_cnt

In [None]:
loaded_her.get_config()

In [None]:
loaded_her.test(10, True, 1)

In [None]:
10e4

# HER w/CNN

In [None]:
env = gym.make('CarRacing-v2')

In [None]:
_,_ = env.reset()

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# The goal callable returned by get_her_goal_functions is bound to desired_goal_func;
# `desired_goal` is not a callable in this notebook.
desired_goal_func(env).shape

In [None]:
cnn_layers = [
    # {
    #     "batchnorm":
    #     {
    #         "num_features":3
    #     }
    # },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 7,
            "stride": 3,
            "padding": 'valid',
            "bias": False
        }
    },
    {
        "relu":
        {

        }
    },
    {
        "batchnorm":
        {
            "num_features":32
        }
    },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 5,
            "stride": 3,
            "padding": 'valid',
            "bias": False,
        }
    },
    {
        "relu":
        {

        }
    },
    {
        "batchnorm":
        {
            "num_features":32
        }
    },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 3,
            "stride": 3,
            "padding": 'valid',
            "bias": False,
        }
    },
]

cnn = cnn_models.CNN(cnn_layers, env)

In [None]:
# build actor

dense_layers = [
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
]

actor = models.ActorModel(env,
                          cnn_model=cnn,
                          dense_layers=dense_layers,
                          goal_shape=(1,),
                          optimizer="Adam",
                          optimizer_params={'weight_decay':0.0},
                          learning_rate=0.001, normalize=False)

In [None]:
actor

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    )
]


critic = models.CriticModel(env=env,
                            cnn_model=cnn,
                            state_layers=state_layers,
                            merged_layers=merged_layers,
                            goal_shape=(1,),
                            optimizer="Adam",
                            optimizer_params={'weight_decay':0.0},
                            learning_rate=0.001,
                            normalize=False)

In [None]:
critic

In [None]:
goal_shape = desired_goal_func(env).shape
replay_buffer = helper.ReplayBuffer(env, 100000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape,
#                        mean=0.0,
#                        theta=0.05,
#                        sigma=0.15,
#                        dt=1.0, device='cuda')

noise=helper.NormalNoise(shape=env.action_space.shape,
                         mean = 0.0,
                         stddev=0.05,
                         )

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=256,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('CarRacing-v2')])

In [None]:
ddpg_agent.actor_model

In [None]:
her = rl_agents.HER(ddpg_agent,
                    strategy='future',
                    num_goals=4,
                    tolerance=1,
                    desired_goal=desired_goal_func,
                    achieved_goal=achieved_goal_func,
                    reward_fn=reward_func)

In [None]:
her.agent.actor_model

In [None]:
her.train(num_epochs=20,
          num_cycles=50,
          num_episodes=16,
          num_updates=40,
          render=True,
          render_freq=20
        )

In [None]:
her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/models/her")

In [None]:
wandb.finish()

In [None]:
# Roll out a single episode with the HER agent by hand. Every step is written to the
# agent's replay buffer and also appended to local per-step lists, which the later
# cells package into a trajectory and relabel.
# reset environment
state, _ = her.agent.env.reset()
# instantiate empty lists to store current episode trajectory
states, actions, next_states, dones, state_achieved_goals, \
next_state_achieved_goals, desired_goals = [], [], [], [], [], [], []
# set desired goal
desired_goal = her.desired_goal_func(her.agent.env)
# set achieved goal
state_achieved_goal = her.achieved_goal_func(her.agent.env)
# add initial state and goals to local normalizer stats
her.state_normalizer.update_local_stats(state)
her.goal_normalizer.update_local_stats(desired_goal)
her.goal_normalizer.update_local_stats(state_achieved_goal)
# set done flag
done = False
# reset episode reward to 0
episode_reward = 0
# reset steps counter for the episode
episode_steps = 0

while not done:
    # get normalized values for state and desired goal
    state_norm = her.state_normalizer.normalize(state)
    desired_goal_norm = her.goal_normalizer.normalize(desired_goal)
    # get action
    action = her.agent.get_action(state_norm, desired_goal_norm, grad=False)
    # take action
    next_state, reward, term, trunc, _ = her.agent.env.step(action)
    # get next state achieved goal
    next_state_achieved_goal = her.achieved_goal_func(her.agent.env)
    # add next state and next state achieved goal to normalizers
    her.state_normalizer.update_local_stats(next_state)
    her.goal_normalizer.update_local_stats(next_state_achieved_goal)
    # store trajectory in replay buffer (non normalized!)
    # NOTE(review): `done` here is still the flag from *before* this step; it is only
    # updated at the bottom of the loop, and the loop exits as soon as it becomes True.
    # As a result every transition stored below (and every entry appended to `dones`)
    # carries done=False, including the final one. Confirm this is intended.
    her.agent.replay_buffer.add(state, action, reward, next_state, done,\
                                    state_achieved_goal, next_state_achieved_goal, desired_goal)
    
    # append step state, action, next state, and goals to respective lists
    states.append(state)
    actions.append(action)
    next_states.append(next_state)
    dones.append(done)
    state_achieved_goals.append(state_achieved_goal)
    next_state_achieved_goals.append(next_state_achieved_goal)
    desired_goals.append(desired_goal)

    # add to episode reward and increment steps counter
    episode_reward += reward
    episode_steps += 1
    # update state and state achieved goal
    state = next_state
    state_achieved_goal = next_state_achieved_goal
    # update done flag (termination and truncation both end the episode)
    if term or trunc:
        done = True

In [None]:
# package episode states, actions, next states, and goals into trajectory tuple
trajectory = (states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)

In [None]:
states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals = trajectory

In [None]:
for idx, (s, a, ns, d, sag, nsag, dg) in enumerate(zip(states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)):
    print(f'a={a}, d={d}, sag={sag}, nsag={nsag}, dg={dg}')

In [None]:
# Hand-written hindsight relabelling of the trajectory lists built above: each step is
# re-stored in the replay buffer with a substituted desired goal and a reward recomputed
# by her.reward_fn for that goal.
strategy = "future"
num_goals = 4

# loop over each step in the trajectory to set new achieved goals, calculate new reward, and save to replay buffer
for idx, (state, action, next_state, done, state_achieved_goal, next_state_achieved_goal, desired_goal) in enumerate(zip(states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)):

    # "final": the substituted goal is the achieved goal of the last step of the episode
    if strategy == "final":
        new_desired_goal = next_state_achieved_goals[-1]
        new_reward = her.reward_fn(state_achieved_goal, next_state_achieved_goal, new_desired_goal)
        print(f'transition: action={action}, reward={new_reward}, done={done}, state_achieved_goal={state_achieved_goal}, next_state_achieved_goal={next_state_achieved_goal}, desired_goal={new_desired_goal}')
        her.agent.replay_buffer.add(state, action, new_reward, next_state, done, state_achieved_goal, next_state_achieved_goal, new_desired_goal)

    # "future": up to num_goals substituted goals, each taken from a later step of the same episode
    if strategy == 'future':
        for i in range(num_goals):
            # stop early near the end of the episode: at most (len(states) - idx - 1)
            # goals are generated for this step, and none for the last step
            if idx + i + 1 >= len(states):
                break
            # uniform draw over the later indices idx+1 .. len(states)-1 (upper bound is exclusive);
            # draws are independent, so the same future step can be picked more than once
            goal_idx = np.random.randint(idx + 1, len(states))
            new_desired_goal = next_state_achieved_goals[goal_idx]
            new_reward = her.reward_fn(state_achieved_goal, next_state_achieved_goal, new_desired_goal)
            print(f'transition: action={action}, reward={new_reward}, done={done}, state_achieved_goal={state_achieved_goal}, next_state_achieved_goal={next_state_achieved_goal}, desired_goal={new_desired_goal}')
            her.agent.replay_buffer.add(state, action, new_reward, next_state, done, state_achieved_goal, next_state_achieved_goal, new_desired_goal)
    

    


In [None]:
s, a, r, ns, d, sag, nsag, dg = her.agent.replay_buffer.sample(100)

In [None]:
for i in range(100):
    print(f'{i}: a={a[i]}, r={r[i]}, d={d[i]}, sag={sag[i]}, nsag={nsag[i]}, dg={dg[i]} ')

# HER Pendulum

In [None]:
env = gym.make('Pendulum-v1')

In [None]:
# build actor

dense_layers = [
    (
        400,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        300,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    )
]

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, optimizer='Adam',
                          optimizer_params={'weight_decay':0.01}, learning_rate=0.001, normalize=False)

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    )
]


critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.001, normalize=False)

In [None]:
# Replay buffer and exploration noise for the Pendulum HER experiment.
# The third argument is presumably the goal shape (other cells pass `goal_shape` in this
# position); it is hard-coded to (3,) here — verify it matches the goal functions used.
replay_buffer = helper.ReplayBuffer(env, 100000, (3,))
# NOTE(review): the noise device is hard-coded to 'cuda'.
noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.99,
                            tau=0.005,
                            replay_buffer=replay_buffer,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('Pendulum-v1')])

In [None]:
def desired_goal_func(env):
    """Return the fixed desired goal: a 3-element float vector of zeros.

    `env` is not used by this function.
    """
    return np.zeros(3)

def achieved_goal_func(env):
    """Return the achieved goal, taken to be whatever the env's `_get_obs` returns.

    The `_get_obs` attribute is looked up through `env.get_wrapper_attr` and then
    called with no arguments.
    """
    get_obs = env.get_wrapper_attr('_get_obs')
    return get_obs()

def reward_func(env):
    """Placeholder reward function: performs no computation and returns None.

    NOTE(review): other cells in this notebook call `her.reward_fn` with three goal
    arguments (state achieved goal, next-state achieved goal, desired goal), whereas
    this stub accepts a single `env` argument. Confirm it is never actually invoked
    with the configuration it is passed to.
    """
    return None

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='none',
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=10.0
)

In [None]:
her.agent.critic_model

In [None]:
her.agent.target_critic_model

In [None]:
her.train(1,1,100,1)

In [None]:
wandb.finish()

In [None]:
state = env.observation_space.sample()
state

In [None]:
her.agent.state_normalizer.normalize(state)

In [None]:
goal = her.desired_goal_func(her.agent.env)
goal

In [None]:
her.agent.goal_normalizer.normalize(goal)

In [None]:
def remove_renders(folder_path):
    """Delete rendered-video artefacts directly inside `folder_path`.

    Every directory entry whose name ends in ".mp4" or ".meta.json" is removed with
    `os.remove`; all other entries are left alone. Sub-directories are not searched.
    """
    render_suffixes = (".mp4", ".meta.json")
    # os.listdir yields bare names, so each match is joined back onto the folder path
    stale_paths = [
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        if entry.endswith(render_suffixes)
    ]
    for stale_path in stale_paths:
        os.remove(stale_path)

In [None]:
remove_renders("/workspaces/RL_Agents/pytorch/src/app/assets/models/ddpg/renders/training")

# HER Fetch-Reach (Robotics)

In [None]:
env = gym.make("FetchReach-v2", max_episode_steps=50)

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
achieved_goal_func(env)

In [None]:
env.get_wrapper_attr("_get_obs")()

In [None]:
# reset env state
env.reset()

In [None]:
goal_shape = desired_goal_func(env).shape

In [None]:
goal_shape

In [None]:
# build actor

dense_layers = [
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    )
]

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, goal_shape=goal_shape, optimizer='Adam',
                          optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
actor

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
               
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
]


critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, goal_shape=goal_shape, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
critic

In [None]:
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=256,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback("FetchReach-v2")])

In [None]:
ddpg_agent.critic_model

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='future',
    tolerance=0.05,
    num_goals=4,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=5.0
)

In [None]:
her.train(num_epochs=50,
          num_cycles=50,
          num_episodes=16,
          num_updates=40,
          render=True,
          render_freq=1000)

In [None]:
states, action, rewards, next_states, dones, achieved_goals, next_achieved_goals, desired_goals = her.agent.replay_buffer.sample(2)

In [None]:
desired_goals

In [None]:
her.agent.env.get_wrapper_attr("distance_threshold")

In [None]:
# get success
her.agent.env.get_wrapper_attr("_is_success")(achieved_goal_func(her.agent.env), desired_goal_func(her.agent.env))

In [None]:
her.agent.env.get_wrapper_attr("goal_distance")(next_state_achieved_goal, desired_goal, None)

In [None]:
pusher_her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/assets/models/her")

In [None]:
pusher_her.agent.env.reset()

In [None]:
pusher_her.get_config()

In [None]:
wandb.finish()

In [None]:
np.linalg.norm(pusher_her.agent.env.get_wrapper_attr("get_body_com")("goal") - pusher_her.agent.env.get_wrapper_attr("get_body_com")("object"))

In [None]:
pusher_her.agent.replay_buffer.get_config()

In [None]:

pusher_her.agent.replay_buffer.desired_goals

In [None]:
## TEST ENV
env = gym.make("Pusher-v5", render_mode="rgb_array")

In [None]:
env = gym.wrappers.RecordVideo(
                    env,
                    "/renders/training",
                    episode_trigger=lambda x: True,
                )


In [None]:
# Smoke-test the environment by driving it with random actions for 1000 steps.
# `env` is expected to be the test environment created in the cells above.
state, _ = env.reset()

for i in range(1000):
    # take a randomly sampled action
    next_state, reward, term, trunc, _ = env.step(env.action_space.sample())
    # Once an episode has terminated or been truncated the env must be reset before it
    # is stepped again; without this the loop keeps stepping a finished episode.
    if term or trunc:
        state, _ = env.reset()
env.close()

# HER Fetch Push (Robotics)

In [None]:
env = gym.make('FetchPush-v2')

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# reset env state
env.reset()

In [None]:
goal_shape = desired_goal_func(env).shape

In [None]:
# build actor

dense_layers = [
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    )
]

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, goal_shape=goal_shape, optimizer='Adam',
                          optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
               
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
]


critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, goal_shape=goal_shape, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.3,
                            replay_buffer=replay_buffer,
                            batch_size=128,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback("FetchPush-v2")],
                            save_dir="fetch_push/models/ddpg/"
                            )

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='final',
    tolerance=0.05,
    num_goals=4,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=5.0,
    save_dir="fetch_push/models/her/"
)

In [None]:
her.train(num_epochs=50,
          num_cycles=50,
          num_episodes=16,
          num_updates=40,
          render=True,
          render_freq=1000)

# TESTING MULTITHREADING

In [None]:
env = gym.make('FetchPush-v2')

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# reset env state
env.reset()

In [None]:
goal_shape = desired_goal_func(env).shape

In [None]:
# build actor

dense_layers = [
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    )
]

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, goal_shape=goal_shape, optimizer='Adam',
                          optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
               
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
]


critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, goal_shape=goal_shape, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.3,
                            replay_buffer=replay_buffer,
                            batch_size=128,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback("FetchPush-v2")],
                            save_dir="fetch_push/models/ddpg/"
                            )

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='final',
    num_workers=4,
    tolerance=0.05,
    num_goals=4,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=5.0,
    save_dir="fetch_push/models/her/"
)

In [None]:
her.train()

# TESTING

In [None]:
# load config
config_path = "/workspaces/RL_Agents/pytorch/src/app/HER_Test/her/config.json"
with open(config_path, 'r') as file:
    config = json.load(file)

In [None]:
config

In [None]:
agent = rl_agents.HER.load(config)

In [None]:
for callback in agent.agent.callbacks:
    print(callback._sweep)

# Co-Occurrence

In [None]:
import subprocess

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'assets/wandb_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    wandb_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(wandb_config)

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'assets/sweep_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    sweep_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(sweep_config)

In [None]:
# Write both configurations into ./sweep/ (created if missing).
# NOTE(review): the variable names and file names cross over here — `sweep_config` is
# written to sweep/train_config.json, while `wandb_config` is written to
# sweep/sweep_config.json. Confirm this mapping is intended.
# Save the updated configuration to a train config file
os.makedirs('sweep', exist_ok=True)
train_config_path = os.path.join(os.getcwd(), 'sweep/train_config.json')
with open(train_config_path, 'w') as f:
    json.dump(sweep_config, f)

# Save and Set the sweep config path
sweep_config_path = os.path.join(os.getcwd(), 'sweep/sweep_config.json')
with open(sweep_config_path, 'w') as f:
    json.dump(wandb_config, f)

In [None]:
# Launch sweep.py as a separate process using whichever `python` is first on PATH.
command = ['python', 'sweep.py']

# Set the environment variable
# (done before Popen so the child process inherits it from os.environ)
os.environ['WANDB_DISABLE_SERVICE'] = 'true'

# Popen returns immediately; the process handle is not kept, so this cell neither waits
# for the sweep to finish nor checks its exit status.
subprocess.Popen(command)

In [None]:
# Set the environment variable
os.environ['WANDB_DISABLE_SERVICE'] = 'true'

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'sweep/sweep_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    sweep_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(sweep_config)

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'sweep/train_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    train_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(train_config)

In [None]:
# Register the sweep under the project named in sweep_config, then run a wandb agent
# in this kernel that executes wandb_support._run_sweep for each sweep run.
sweep_id = wandb.sweep(sweep=sweep_config, project=sweep_config["project"])
# loop over num wandb agents
# NOTE(review): `num_agents` is currently unused — the loop below is commented out, so
# exactly one agent is started.
num_agents = 1
# for agent in range(num_agents):
wandb.agent(
    sweep_id,
    function=lambda: wandb_support._run_sweep(sweep_config, train_config,),
    # number of runs this agent executes, taken from the train config
    count=train_config['num_sweeps'],
    project=sweep_config["project"],
)

In [None]:
sweep_config

# PPO

In [None]:
from pathlib import Path
from typing import List, Tuple
import torch.nn.functional as F
from torch.distributions import Categorical, Beta, Normal, kl_divergence
import time
import cv2

In [None]:
# PPO on a vectorised BipedalWalker-v3 with a Beta policy and the 'kl' loss (kl_coeff = 0.1).
# PARAMS
# env_id = 'Pendulum-v1'
# env_id = 'LunarLanderContinuous-v3'
env_id = 'BipedalWalker-v3'
policy_lr = 3e-4
value_lr = 2e-5
entropy_coeff = 0.1
kl_coeff = 0.1
loss = 'kl'
timesteps = 100_000
num_envs = 10
device = 'cuda'

seed = 42
env = gym.make_vec(env_id, num_envs)
# env = gym.make('BipedalWalker-v3')
# _,_ = env.reset()
# sample = env.action_space.sample()
# if isinstance(sample, np.int64) or isinstance(sample, np.int32):
#     print(f'discrete action space of size {env.action_space.n}')
# elif isinstance(sample, np.ndarray):
#     print(f'continuous action space of size {env.action_space.shape}')

# Seed the torch (CPU and CUDA) and NumPy global RNGs.
T.manual_seed(seed)
T.cuda.manual_seed(seed)
np.random.seed(seed)
# Seed the environment through the Gymnasium reset API. The previous statement,
# `gym.utils.seeding.np_random.seed = seed`, only attached an attribute to the
# `np_random` helper function and did not seed any generator.
env.reset(seed=seed)
# Build policy model
dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
policy = StochasticContinuousPolicy(env, num_envs, dense_layers, learning_rate=policy_lr, distribution='Beta', device=device)
# Build value model (same layer layout as the policy)
dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
value_function = ValueModel(env, dense_layers, learning_rate=value_lr, device=device)
ppo_agent_hybrid1 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
hybrid_train_info_1 = ppo_agent_hybrid1.train(timesteps=timesteps, trajectory_length=2048, batch_size=640, learning_epochs=10, num_envs=num_envs)

# seed = 43
# env = gym.make(env_id)
# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed
# # Build policy model
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# policy = StochasticContinuousPolicy(env, dense_layers, learning_rate=3e-4)
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# value_function = ValueModel(env, dense_layers, learning_rate=3e-4)
# ppo_agent_hybrid2 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
# hybrid_train_info_2 = ppo_agent_hybrid2.train(timesteps=timesteps, trajectory_length=2048, batch_size=64, learning_epochs=10)

# seed = 44
# env = gym.make(env_id)
# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed
# # Build policy model
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# policy = StochasticContinuousPolicy(env, dense_layers, learning_rate=3e-4)
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# value_function = ValueModel(env, dense_layers, learning_rate=3e-4)
# ppo_agent_hybrid3 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
# hybrid_train_info_3 = ppo_agent_hybrid3.train(timesteps=timesteps, trajectory_length=2048, batch_size=64, learning_epochs=10)
# hybrid_test_info = ppo_agent_hybrid.test(1000, 'PPO_hybrid', 100)

In [None]:
# PPO on a vectorised BipedalWalker-v3 with a Beta policy and the 'kl' loss (kl_coeff = 0.01).
# PARAMS
# env_id = 'Pendulum-v1'
# env_id = 'LunarLanderContinuous-v3'
env_id = 'BipedalWalker-v3'
policy_lr = 3e-4
value_lr = 2e-5
entropy_coeff = 0.1
kl_coeff = 0.01
loss = 'kl'
timesteps = 100_000
num_envs = 10
device = 'cuda'

seed = 42
env = gym.make_vec(env_id, num_envs)
# env = gym.make('BipedalWalker-v3')
# _,_ = env.reset()
# sample = env.action_space.sample()
# if isinstance(sample, np.int64) or isinstance(sample, np.int32):
#     print(f'discrete action space of size {env.action_space.n}')
# elif isinstance(sample, np.ndarray):
#     print(f'continuous action space of size {env.action_space.shape}')

# Seed the torch (CPU and CUDA) and NumPy global RNGs.
T.manual_seed(seed)
T.cuda.manual_seed(seed)
np.random.seed(seed)
# Seed the environment through the Gymnasium reset API. The previous statement,
# `gym.utils.seeding.np_random.seed = seed`, only attached an attribute to the
# `np_random` helper function and did not seed any generator.
env.reset(seed=seed)
# Build policy model
dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
policy = StochasticContinuousPolicy(env, num_envs, dense_layers, learning_rate=policy_lr, distribution='Beta', device=device)
# Build value model (same layer layout as the policy)
dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
value_function = ValueModel(env, dense_layers, learning_rate=value_lr, device=device)
ppo_agent_hybrid2 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
hybrid_train_info_2 = ppo_agent_hybrid2.train(timesteps=timesteps, trajectory_length=2048, batch_size=640, learning_epochs=10, num_envs=num_envs)

# seed = 43
# env = gym.make(env_id)
# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed
# # Build policy model
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# policy = StochasticContinuousPolicy(env, dense_layers, learning_rate=3e-4)
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# value_function = ValueModel(env, dense_layers, learning_rate=3e-4)
# ppo_agent_hybrid2 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
# hybrid_train_info_2 = ppo_agent_hybrid2.train(timesteps=timesteps, trajectory_length=2048, batch_size=64, learning_epochs=10)

# seed = 44
# env = gym.make(env_id)
# T.manual_seed(seed)
# T.cuda.manual_seed(seed)
# np.random.seed(seed)
# gym.utils.seeding.np_random.seed = seed
# # Build policy model
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# policy = StochasticContinuousPolicy(env, dense_layers, learning_rate=3e-4)
# dense_layers = [(128,"tanh",{"default":{}}),(128,"tanh",{"default":{}})]
# value_function = ValueModel(env, dense_layers, learning_rate=3e-4)
# ppo_agent_hybrid3 = PPO(env, policy, value_function, distribution='Beta', discount=0.99, gae_coefficient=0.95, policy_clip=0.2, entropy_coefficient=entropy_coeff, kl_coefficient=kl_coeff, loss=loss)
# hybrid_train_info_3 = ppo_agent_hybrid3.train(timesteps=timesteps, trajectory_length=2048, batch_size=64, learning_epochs=10)
# hybrid_test_info = ppo_agent_hybrid.test(1000, 'PPO_hybrid', 100)

Stochastic Continuous

In [None]:
# Instantiate EnvWrapper for Gymnasium env
env_id = 'InvertedPendulum-v5'

gym_env = gym.make(env_id)
env_spec = gym_env.spec.to_json()
print(f'env spec:{env_spec}')

env = GymnasiumWrapper(env_spec)

In [None]:
env = env._initialize_env(env_spec, 0, 50, 42)

In [None]:
env.observation_space

In [None]:
## PARAMS ##

env_id = "LunarLanderContinuous-v3"
gym_env = gym.make(env_id)
env_spec = gym_env.spec.to_json()
print(f'env spec:{env_spec}')


env = GymnasiumWrapper(gym_env.spec)

timesteps = 1_000_000
trajectory_length = 2000
batch_size = 64
learning_epochs = 20
num_envs = 1
policy_lr = 3e-4
value_lr = 3e-4
discount = 0.99
gae_coefficient = 0.95
policy_clip = 1e8
entropy_coeff = 0.001
kl_coeff = 3.0
normalize_advantages = True
normalize_values = False
norm_clip = np.inf
policy_grad_clip = 1.0
value_grad_clip = 1.0
reward_clip = np.inf
distribution = 'beta'
device = 'cuda'

policy_optimizer = {'type':'Adam', 'params':{'lr':policy_lr}}
value_optimizer = {'type':'Adam', 'params':{'lr':value_lr}}

# LR Scheduler
# scheduler = {'type':'ExponentialLR', 'params':{'gamma':0.0001, 'last_epoch':-1}}
lr_scheduler = {'type':'linear', 'params':{'start_factor':1.0, 'end_factor':0.04, 'total_iters':1500, 'last_epoch':-1}}
# scheduler = {'type':'StepLR', 'params':{'step_size':1000, 'gamma':0.999, 'last_epoch':-1}}
# scheduler = {'type':'CosineAnnealingLR', 'params':{'T_max':10000, 'eta_min':0.0001, 'last_epoch':-1}}
# scheduler = None

# Entropy Scheduler
entropy_schedule = ScheduleWrapper({'type':'linear', 'params':{'start_factor':1.0, 'end_factor':0.1, 'total_iters':1500, 'last_epoch':-1}})

# Policy Clip Scheduler
# policy_clip_schedule = ScheduleWrapper({'type':'linear', 'params':{'start_factor':1.0, 'end_factor':0.1, 'total_iters':46875, 'last_epoch':-1}})
policy_clip_schedule = None

# KL Adapter
kl_adapter = AdaptiveKL(initial_beta=kl_coeff, target_kl=0.01, scale_up=1.2, scale_down=0.8, kl_tolerance_high=1.5, kl_tolerance_low=0.5)

# Render Settings
render_freq = 100

## WANDB ##
project_name = 'LunarLanderContinuous-v3'
run_name = None
# callbacks = [WandbCallback(project_name, run_name)]
callbacks = []

seed = 42
# env = gym.make(env_id)

save_dir = "LunarLanderContinuous_v3"


# Build policy model
# Hidden stack: two 64-unit dense layers, each followed by tanh. The same `layer_config`
# list object is passed to both the policy and the value model below.
layer_config = [
    {'type': 'dense', 'params': {'units': 64, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'tanh'},
    {'type': 'dense', 'params': {'units': 64, 'kernel': 'default', 'kernel params':{}}},
    {'type': 'tanh'},
]
# NOTE(review): the trailing comma makes `output_layer` a 1-tuple containing the dict,
# not the dict itself. Confirm the model constructors expect a sequence here.
output_layer = {'type': 'dense', 'params': {'kernel': 'default', 'kernel params':{}}},
policy = StochasticContinuousPolicy(env, layer_config, output_layer, optimizer_params=policy_optimizer, scheduler_params=lr_scheduler, distribution=distribution, device=device)
# dense_layers = [(64,"tanh",{"default":{}}),(64,"tanh",{"default":{}})]
value_function = ValueModel(env, layer_config, output_layer, optimizer_params=value_optimizer, scheduler_params=lr_scheduler, device=device)
# NOTE(review): `policy_clip_schedule` is defined with the other parameters in this cell
# but is not passed to PPO here.
ppo = PPO(env, policy, value_function, discount=discount, gae_coefficient=gae_coefficient, policy_clip=policy_clip,
          entropy_coefficient=entropy_coeff, entropy_schedule=entropy_schedule, kl_coefficient=kl_coeff, kl_adapter=kl_adapter, normalize_advantages=normalize_advantages, normalize_values=normalize_values,
          value_normalizer_clip=norm_clip, policy_grad_clip=policy_grad_clip, value_grad_clip=value_grad_clip, reward_clip=reward_clip,
          callbacks=callbacks, save_dir=save_dir,device=device)
# NOTE(review): `hybrid_train_info_2` is also assigned by an earlier PPO cell; running
# this cell overwrites that result.
hybrid_train_info_2 = ppo.train(timesteps=timesteps, trajectory_length=trajectory_length, batch_size=batch_size, learning_epochs=learning_epochs, num_envs=num_envs, seed=seed, render_freq=render_freq)
# ppo.test(10,"ppo_test", 1)
# ppo.test(10,"ppo_test", 1)


In [None]:
# 1. Register ALE envs with Gymnasium so "ale_py:ALE/Pong-v5" is recognized
# gym.register_envs(ale_py)

# def atari_wrappers(env):
#     # 2. Apply Atari-specific wrappers
#     #    - gray-scale & resize with AtariPreprocessing
#     #    - stack the last 4 frames with FrameStack
#     env = gym.wrappers.AtariPreprocessing(
#         env,
#         frame_skip=1,         # how many frames to skip each step (AtariPreprocessing can handle skipping)
#         grayscale_obs=True,   # get grayscale frames
#         scale_obs=True,       # optional: scale from 0..255 to 0..1
#         screen_size=84        # resize to 84×84
#     )
#     env = gym.wrappers.FrameStack(env, 4)
#     return env

## ENV PARAMS
env_id = "ALE/Pong-v5"
gym_env = gym.make(env_id)
env_spec = gym_env.spec.to_json()
print(f'env spec:{env_spec}')

def atari_factory(env):
    """Wrap `env` in `gym.wrappers.AtariPreprocessing` with frame_skip=1 and scale_obs=True."""
    atari_options = {"frame_skip": 1, "scale_obs": True}
    return gym.wrappers.AtariPreprocessing(env, **atari_options)

def framestack_factory(env):
    """Wrap ``env`` in Gymnasium's ``FrameStackObservation`` wrapper.

    Args:
        env: Environment instance to wrap.

    Returns:
        The environment wrapped by ``gym.wrappers.FrameStackObservation``
        with a stack size of 4.
    """
    stack_size = 4
    return gym.wrappers.FrameStackObservation(env, stack_size)

# Wrapper factories handed to the project's GymnasiumWrapper together with the
# Pong spec: Atari preprocessing first, then frame stacking.
# NOTE(review): presumably applied in list order — confirm against GymnasiumWrapper.
wrappers = [atari_factory, framestack_factory]

env = GymnasiumWrapper(gym_env.spec, wrappers)




In [None]:
# Inspect the spec of the first entry in the wrapped environment's `envs` collection.
env.env.envs[0].spec

In [None]:
## PARAMS ##
# Hyperparameters for the Pong PPO run. Every name below is a notebook-level
# global; the model/agent construction at the bottom of this cell and the
# separate `ppo.train(...)` cell read them.

# Rollout and optimisation sizes
timesteps = 10_000_000
trajectory_length = 128
batch_size = 32
learning_epochs = 3
num_envs = 4

# Learning rates
policy_lr = 2.5e-4
value_lr = 2.5e-4

# PPO loss terms
policy_clip = 0.2
value_clip = 0.2
value_loss_coefficient = 0.5
discount = 0.99
gae_coefficient = 0.95
entropy_coeff = 0.01
kl_coeff = 0.0

# Normalisation and clipping
normalize_advantages = True
normalize_values = False
norm_clip = np.inf
policy_grad_clip = 0.5
value_grad_clip = 0.5
reward_clip = 1.0

distribution = 'categorical'
device = 'cuda'

# Optimizer specs (one per network)
policy_optimizer = {
    'type': 'Adam',
    'params': {'lr': policy_lr},
}
value_optimizer = {
    'type': 'Adam',
    'params': {'lr': value_lr},
}

# LR Scheduler -- the same dict object is passed to both networks below.
# NOTE(review): this cell uses type 'linear' while another Pong cell in this
# notebook uses 'LinearLR' -- confirm which spelling the models expect.
# Alternatives kept for reference:
# scheduler = {'type':'ExponentialLR', 'params':{'gamma':0.0001, 'last_epoch':-1}}
# scheduler = {'type':'StepLR', 'params':{'step_size':1000, 'gamma':0.999, 'last_epoch':-1}}
# scheduler = {'type':'CosineAnnealingLR', 'params':{'T_max':10000, 'eta_min':0.0001, 'last_epoch':-1}}
# scheduler = None
lr_scheduler = {
    'type': 'linear',
    'params': {
        'start_factor': 1.0,
        'end_factor': 0.04,
        'total_iters': 200_000,
        'last_epoch': -1,
    },
}

# Entropy Scheduler
entropy_schedule = ScheduleWrapper({
    'type': 'linear',
    'params': {
        'start_factor': 1.0,
        'end_factor': 0.1,
        'total_iters': 200_000,
        'last_epoch': -1,
    },
})

# Policy Clip Scheduler (set to None to disable)
policy_clip_schedule = ScheduleWrapper({
    'type': 'linear',
    'params': {
        'start_factor': 1.0,
        'end_factor': 0.5,
        'total_iters': 25_000,
        'last_epoch': -1,
    },
})

# Value Clip Scheduler (set to None to disable)
value_clip_schedule = ScheduleWrapper({
    'type': 'linear',
    'params': {
        'start_factor': 1.0,
        'end_factor': 0.5,
        'total_iters': 25_000,
        'last_epoch': -1,
    },
})

# KL Adapter (set to None to disable)
kl_adapter = AdaptiveKL(initial_beta=1.0, target_kl=0.01)

# Render Settings
render_freq = 20

## WANDB ##
project_name = 'Pong-v5'
run_name = None
callbacks = [WandbCallback(project_name, run_name)]  # use [] to run without W&B

seed = 42

save_dir = "Pong_v5"


# Build policy model: three conv+ReLU stages, flatten, then a 512-unit dense layer.
layer_config = [
    {
        'type': 'conv2d',
        'params': {'out_channels': 32, 'kernel_size': 8, 'stride': 4},
    },
    {'type': 'relu'},
    {
        'type': 'conv2d',
        'params': {'out_channels': 64, 'kernel_size': 4, 'stride': 2},
    },
    {'type': 'relu'},
    {
        'type': 'conv2d',
        'params': {'out_channels': 64, 'kernel_size': 3, 'stride': 1},
    },
    {'type': 'relu'},
    {'type': 'flatten'},
    {
        'type': 'dense',
        'params': {'units': 512, 'kernel': 'default', 'kernel params': {}},
    },
    {'type': 'relu'},
]

# The original assignment ended with a trailing comma, so `output_layer` is a
# one-element tuple holding the layer dict; that is spelled out explicitly here.
# NOTE(review): confirm the models expect a sequence rather than a bare dict.
output_layer = (
    {'type': 'dense', 'params': {'kernel': 'default', 'kernel params': {}}},
)

policy = StochasticDiscretePolicy(
    env,
    layer_config,
    output_layer,
    optimizer_params=policy_optimizer,
    scheduler_params=lr_scheduler,
    distribution=distribution,
    device=device,
)

# The value network reuses the same layer stack, output spec and LR scheduler.
value_function = ValueModel(
    env,
    layer_config,
    output_layer,
    optimizer_params=value_optimizer,
    scheduler_params=lr_scheduler,
    device=device,
)

ppo = PPO(
    env,
    policy,
    value_function,
    discount=discount,
    gae_coefficient=gae_coefficient,
    policy_clip=policy_clip,
    policy_clip_schedule=policy_clip_schedule,
    value_clip=value_clip,
    value_clip_schedule=value_clip_schedule,
    value_loss_coefficient=value_loss_coefficient,
    entropy_coefficient=entropy_coeff,
    entropy_schedule=entropy_schedule,
    kl_coefficient=kl_coeff,
    kl_adapter=kl_adapter,
    normalize_advantages=normalize_advantages,
    normalize_values=normalize_values,
    value_normalizer_clip=norm_clip,
    policy_grad_clip=policy_grad_clip,
    value_grad_clip=value_grad_clip,
    reward_clip=reward_clip,
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
)
# Training is not started here; it is launched from the separate
# `train_info = ppo.train(...)` cell.

In [None]:
# Display the configuration reported by the freshly built agent.
ppo.get_config()

In [None]:
# Launch PPO training with the hyperparameters defined in the PARAMS cell and
# keep whatever `train` returns for later inspection.
train_info = ppo.train(
    timesteps=timesteps,
    trajectory_length=trajectory_length,
    batch_size=batch_size,
    learning_epochs=learning_epochs,
    num_envs=num_envs,
    seed=seed,
    render_freq=render_freq,
)

In [None]:
# Read a saved PPO run configuration from disk, echo it, and rebuild the agent
# from it via PPO.load.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
config_file_path = '/workspaces/RL_Agents/src/app/final_test/ppo/config.json'
with open(config_file_path, 'r') as file:
    config = json.loads(file.read())

print(f'config:{config}')

ppo_load = PPO.load(config)

In [None]:
# Display the configuration reported by the agent rebuilt from the saved config.
ppo_load.get_config()

CarRacing

In [None]:
## ENV PARAMS
# Instantiate the environment and print its registered spec as JSON.
# NOTE(review): the markdown heading above this cell reads "CarRacing", but the
# environment id used here is Pong -- confirm which one is intended.
env_id = "ALE/Pong-v5"
gym_env = gym.make(env_id)
env_spec = gym_env.spec.to_json()
print(f'env spec:{env_spec}')

def atari_factory(env):
    """Return ``env`` wrapped by ``gym.wrappers.AtariPreprocessing``.

    The wrapper is constructed with ``frame_skip=1`` and ``scale_obs=True``.
    A function with the same name and body is also defined in an earlier cell
    of this notebook.

    Args:
        env: Environment instance to wrap.

    Returns:
        The wrapped environment.
    """
    wrapper_cls = gym.wrappers.AtariPreprocessing
    return wrapper_cls(env, frame_skip=1, scale_obs=True)

def framestack_factory(env):
    """Return ``env`` wrapped by ``gym.wrappers.FrameStackObservation``.

    The stack size passed to the wrapper is 4. A function with the same name
    and body is also defined in an earlier cell of this notebook.

    Args:
        env: Environment instance to wrap.

    Returns:
        The wrapped environment.
    """
    wrapper_cls = gym.wrappers.FrameStackObservation
    return wrapper_cls(env, 4)

# Wrapper factories handed to GymnasiumWrapper together with the env spec.
wrappers = [atari_factory, framestack_factory]

env = GymnasiumWrapper(gym_env.spec, wrappers)

## PARAMS ##
# Hyperparameters for this PPO configuration; all names are notebook globals.

# Rollout and optimisation sizes
timesteps = 2_000_000
trajectory_length = 128
batch_size = 32
learning_epochs = 3
num_envs = 4

# Learning rates
policy_lr = 2.5e-4
value_lr = 2.5e-4

# PPO loss terms; clip and entropy use multiplicative decay factors here
policy_clip = 0.1
clip_decay = 0.9999
discount = 0.99
gae_coefficient = 0.95
entropy_coeff = 0.1
entropy_decay = 0.9999
kl_coeff = 0

# Normalisation and clipping
normalize_advantages = True
normalize_values = False
norm_clip = np.inf
policy_grad_clip = 0.5
value_grad_clip = 0.5
reward_clip = np.inf

distribution = 'categorical'
device = 'cuda'

# Optimizer specs (one per network)
policy_optimizer = {
    'type': 'Adam',
    'params': {'lr': policy_lr},
}
value_optimizer = {
    'type': 'Adam',
    'params': {'lr': value_lr},
}

# LR scheduler -- the same dict object is passed to both networks below.
# NOTE(review): this cell uses type 'LinearLR' while another Pong cell in this
# notebook uses 'linear' -- confirm which spelling the models expect.
# Alternatives kept for reference:
# scheduler = {'type':'ExponentialLR', 'params':{'gamma':0.0001, 'last_epoch':-1}}
# scheduler = {'type':'StepLR', 'params':{'step_size':1000, 'gamma':0.999, 'last_epoch':-1}}
# scheduler = {'type':'CosineAnnealingLR', 'params':{'T_max':10000, 'eta_min':0.0001, 'last_epoch':-1}}
# scheduler = None
lr_scheduler = {
    'type': 'LinearLR',
    'params': {
        'start_factor': 1.0,
        'end_factor': 0.04,
        'total_iters': 375_000,
        'last_epoch': -1,
    },
}

# Render Settings
render_freq = 20

## WANDB ##
project_name = 'Pong-v5'
run_name = None
callbacks = [WandbCallback(project_name, run_name)]  # use [] to run without W&B

seed = 42

save_dir = "Pong_v5"


# Build policy model: three conv+ReLU stages, flatten, then a 512-unit dense layer.
layer_config = [
    {
        'type': 'conv2d',
        'params': {'out_channels': 32, 'kernel_size': 8, 'stride': 4},
    },
    {'type': 'relu'},
    {
        'type': 'conv2d',
        'params': {'out_channels': 64, 'kernel_size': 4, 'stride': 2},
    },
    {'type': 'relu'},
    {
        'type': 'conv2d',
        'params': {'out_channels': 64, 'kernel_size': 3, 'stride': 1},
    },
    {'type': 'relu'},
    {'type': 'flatten'},
    {
        'type': 'dense',
        'params': {'units': 512, 'kernel': 'default', 'kernel params': {}},
    },
    {'type': 'relu'},
]

# The original assignment ended with a trailing comma, so `output_layer` is a
# one-element tuple holding the layer dict; that is spelled out explicitly here.
# NOTE(review): confirm the models expect a sequence rather than a bare dict.
output_layer = (
    {'type': 'dense', 'params': {'kernel': 'default', 'kernel params': {}}},
)

policy = StochasticDiscretePolicy(
    env,
    layer_config,
    output_layer,
    optimizer_params=policy_optimizer,
    scheduler_params=lr_scheduler,
    distribution=distribution,
    device=device,
)

# The value network reuses the same layer stack, output spec and LR scheduler.
value_function = ValueModel(
    env,
    layer_config,
    output_layer,
    optimizer_params=value_optimizer,
    scheduler_params=lr_scheduler,
    device=device,
)

# NOTE(review): this call passes `clip_decay` / `entropy_decay`, whereas the
# other Pong cell in this notebook passes `policy_clip_schedule` /
# `entropy_schedule` -- confirm PPO still accepts the decay keywords.
ppo = PPO(
    env,
    policy,
    value_function,
    discount=discount,
    gae_coefficient=gae_coefficient,
    policy_clip=policy_clip,
    clip_decay=clip_decay,
    entropy_coefficient=entropy_coeff,
    entropy_decay=entropy_decay,
    kl_coefficient=kl_coeff,
    normalize_advantages=normalize_advantages,
    normalize_values=normalize_values,
    value_normalizer_clip=norm_clip,
    policy_grad_clip=policy_grad_clip,
    value_grad_clip=value_grad_clip,
    reward_clip=reward_clip,
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
)
# Training is intentionally not started in this cell:
# hybrid_train_info_2 = ppo.train(timesteps=timesteps, trajectory_length=trajectory_length, batch_size=batch_size, learning_epochs=learning_epochs, num_envs=num_envs, seed=seed, render_freq=render_freq)

Stochastic Discrete

In [None]:
## PARAMS ##
# PPO on LunarLander with a categorical policy: hyperparameters, model
# construction and the training call all live in this one cell.
env_id = 'LunarLander-v3'

# Rollout and optimisation sizes
timesteps = 1_000_000
trajectory_length = 2000
batch_size = 64
learning_epochs = 10
num_envs = 4

# Learning rates
policy_lr = 3e-4
value_lr = 2e-5

# Loss terms, normalisation and clipping
entropy_coeff = 0.001
kl_coeff = 0.0
normalize_advantages = True
normalize_values = False
norm_clip = np.inf
grad_clip = 40.0
reward_clip = 1.0

distribution = 'categorical'
device = 'cuda'

# Render Settings
render_freq = 100

## WANDB ##
project_name = 'LunarLander-v3'
run_name = None
callbacks = [WandbCallback(project_name, run_name)]  # use [] to run without W&B

seed = 42
# Unlike the Pong cells, the raw gym environment is passed to the models here.
env = gym.make(env_id)

save_dir = "LunarLander-v3"


# Build policy model: two 64-unit dense layers, each followed by tanh.
layer_config = [
    {
        'type': 'dense',
        'params': {'units': 64, 'kernel': 'default', 'kernel params': {}},
    },
    {'type': 'tanh'},
    {
        'type': 'dense',
        'params': {'units': 64, 'kernel': 'default', 'kernel params': {}},
    },
    {'type': 'tanh'},
]

# The original assignment ended with a trailing comma, so `output_layer` is a
# one-element tuple holding the layer dict; that is spelled out explicitly here.
# NOTE(review): confirm the models expect a sequence rather than a bare dict.
output_layer = (
    {'type': 'dense', 'params': {'kernel': 'default', 'kernel params': {}}},
)

# NOTE(review): these constructors receive `learning_rate=...`, whereas the Pong
# cells pass `optimizer_params` / `scheduler_params` -- confirm the keyword is
# still supported.
policy = StochasticDiscretePolicy(
    env,
    layer_config,
    output_layer,
    learning_rate=policy_lr,
    distribution=distribution,
    device=device,
)

value_function = ValueModel(
    env,
    layer_config,
    output_layer,
    learning_rate=value_lr,
    device=device,
)

# discount, gae_coefficient and policy_clip are given as literals so the
# notebook-level variables of the same names are left untouched by this cell.
ppo = PPO(
    env,
    policy,
    value_function,
    discount=0.99,
    gae_coefficient=0.95,
    policy_clip=0.2,
    entropy_coefficient=entropy_coeff,
    kl_coefficient=kl_coeff,
    normalize_advantages=normalize_advantages,
    normalize_values=normalize_values,
    value_normalizer_clip=norm_clip,
    policy_grad_clip=grad_clip,
    reward_clip=reward_clip,
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
)

hybrid_train_info_2 = ppo.train(
    timesteps=timesteps,
    trajectory_length=trajectory_length,
    batch_size=batch_size,
    learning_epochs=learning_epochs,
    num_envs=num_envs,
    seed=seed,
    render_freq=render_freq,
)
# ppo.test(10,"ppo_test", 1)


In [None]:
# Scratch cell: create a CartPole environment.
# NOTE: this rebinds the notebook-level `env_id` and `env`.
env_id = 'CartPole-v1'
env = gym.make(env_id)

In [None]:
# Show the `n` attribute of the current env's action space.
env.action_space.n

In [None]:
# Call the agent's private `_initialize_env` helper directly and keep the result.
# NOTE(review): the positional arguments 100, 2, 42 are presumably
# (render_freq, num_envs, seed) -- confirm against PPO._initialize_env.
env_vec = ppo._initialize_env(100, 2, 42)

In [None]:
# Print the spec of every entry in `env_vec.envs`.
# A dedicated loop variable is used instead of `env` so that iterating here
# does not silently rebind the notebook-level `env` to the last sub-environment.
for sub_env in env_vec.envs:
    print(sub_env.spec)

In [None]:
# Scratch cell: create the RAM-observation Space Invaders environment.
# NOTE: this rebinds the notebook-level `env_id` and `env`.
env_id = 'ALE/SpaceInvaders-ram-v5'
env = gym.make(env_id)

In [None]:
# Show the `n` attribute of the current env's action space.
env.action_space.n

In [None]:
# Read the saved walker2d PPO configuration from disk.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
config_file_path = '/workspaces/RL_Agents/src/app/walker2d/ppo/config.json'
with open(config_file_path, 'r') as file:
    config = json.loads(file.read())

# Echo the parsed configuration so it can be checked by eye.
print(config)

In [None]:
# Rebuild a PPO agent from the configuration loaded in the previous cell.
walker = PPO.load(config)

In [None]:
# Inspect the environment attached to the agent loaded in the previous cell.
# The original cell referenced `humanoid`, a name that is not the agent loaded
# directly above (`walker`) nor the one tested directly below.
walker.env

In [None]:
# Run the loaded agent's test routine and keep what it returns.
# NOTE(review): the positional 10 is presumably the number of test episodes --
# confirm against PPO.test.
test_data = walker.test(10, render_freq=1)

In [None]:
# Display the value returned by `walker.test(...)` in the previous cell.
test_data

In [None]:
# Scratch: evenly spaced values starting at 0.001 with step 0.005; the stop
# value 0.101 is exclusive.
# NOTE(review): NumPy documents that `arange` with a non-integer step can give
# an unexpected number of elements due to floating-point rounding;
# `np.linspace` is the recommended alternative when the count matters.
np.arange(0.001, 0.101, 0.005)