In [1]:
import os
import json

import torch
import torch.nn as nn
from torch import optim
import numpy as np
import pandas as pd

import torch_utils
from torch import distributions

import gymnasium as gym
import gymnasium_robotics as gym_robo
import models
import cnn_models
import rl_agents
import rl_callbacks
import helper
import gym_helper
import wandb_support
import wandb
import gym_helper
import ale_py

# from mpi4py import MPI

In [None]:
import mpi4py

In [None]:
gym_robo.__version__

In [None]:
# Print CUDA / cuDNN diagnostics for the current machine.
# Per-device queries (device name, properties, memory counters) are only issued when a
# CUDA device is actually available: calling them for device 0 on a CPU-only machine
# raises and would abort the cell.
torch.cuda.empty_cache()
cuda_available = torch.cuda.is_available()
print(f"CUDA version: {torch.version.cuda}")
print(f"CUDA available: {cuda_available}")
print(f"Number of CUDA devices: {torch.cuda.device_count()}")
print('__CUDNN VERSION:', torch.backends.cudnn.version())
if cuda_available:
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)
    print('Memory Usage:')
    # memory counters are reported in GiB (bytes / 1024**3), rounded to one decimal
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
else:
    print('No CUDA device available; skipping per-device diagnostics.')

# TEST

In [None]:
gym_robo.register_robotics_envs()

In [None]:
gym.envs.registration.registry

In [None]:
# Credentials must never be committed with the notebook. Without an explicit key,
# wandb.login() uses the WANDB_API_KEY environment variable or previously stored
# credentials, and prompts interactively if neither is present.
# SECURITY: the API key that used to be hardcoded on this line was exposed in source
# control and should be revoked in the W&B account settings.
wandb.login()

In [None]:
# Smoke test: run random actions in FetchPush for a few episodes while recording video
# into `test/`.
env = gym.make('FetchPush-v2', max_episode_steps=100, render_mode='rgb_array')
# `i % 1 == 0` holds for every episode index, i.e. every episode is recorded.
env = gym.wrappers.RecordVideo(env, 'test/', episode_trigger=lambda i: i%1==0)

episodes = 10

try:
    for episode in range(episodes):
        obs, _ = env.reset()
        done = False
        while not done:
            # Bind the step info to `info`, not `dict`: naming it `dict` would shadow the
            # builtin type for every cell executed afterwards in this kernel.
            obs, r, term, trunc, info = env.step(env.action_space.sample())
            # an episode ends on either termination or truncation (time limit)
            done = term or trunc
finally:
    # release the environment even if the rollout is interrupted
    env.close()

In [None]:
env = gym.make("FetchReach-v2")
env.reset()
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())

# The following always has to hold:
assert reward == env.compute_reward(obs["achieved_goal"], obs["desired_goal"], info)
assert truncated == env.compute_truncated(obs["achieved_goal"], obs["desired_goal"], info)
assert terminated == env.compute_terminated(obs["achieved_goal"], obs["desired_goal"], info)

In [None]:
# compute_reward needs the achieved goal, the desired goal and the step info (same call
# shape as the assertions in the FetchReach cell); calling it bare raises TypeError.
env.compute_reward(obs["achieved_goal"], obs["desired_goal"], info)

In [None]:
env = gym.make('FetchPush-v2', render_mode='rgb_array')

In [None]:
# Report whether plain attribute lookup on the (wrapped) env finds `distance_threshold`.
print('true' if hasattr(env, "distance_threshold") else 'false')

In [None]:
# Presence check through the wrapper stack. get_wrapper_attr raises AttributeError when
# the attribute is missing, and a legitimate falsy value (e.g. 0.0) must still count as
# "present", so test for the exception instead of the value's truthiness.
try:
    env.get_wrapper_attr("distance_threshold")
except AttributeError:
    print('false')
else:
    print('true')

In [None]:
print(dir(env))


# DDPG

In [None]:
env = gym.make('LunarLanderContinuous-v2')

In [None]:
# Use the GPU when one is available, otherwise fall back to CPU so the DDPG cells below
# still run on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# # build actor

# dense_layers = [
#     (
#         400,
#         "relu",
#         {
#             "variance scaling": {
#                 "scale": 1.0,
#                 "mode": "fan_in",
#                 "distribution": "uniform",
#             }
#         },
#     ),
#     (
#         300,
#         "relu",
#         {
#             "variance scaling": {
#                 "scale": 1.0,
#                 "mode": "fan_in",
#                 "distribution": "uniform",
#             }
#         },
#     )
# ]

# actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, optimizer='Adam',
#                           optimizer_params={'weight_decay':0.01}, learning_rate=0.001, normalize_layers=True)

In [None]:
# build actor

dense_layers = [
    (
        400,
        "relu",
        {
            "default": {
                
            }
        },
    ),
    (
        300,
        "relu",
        {
            "default": {
                
            }
        },
    )
]

# output_kernel = {"uniform":{"a":-0.003, "b":0.003}}
output_kernel = {"default":{}}

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, output_layer_kernel=output_kernel,
                          optimizer='Adam', optimizer_params={'weight_decay':0.0}, learning_rate=0.001,
                          normalize_layers=False, device=device)

In [None]:
actor

In [None]:
# # build critic

# state_layers = [
#     (
#         400,
#         "relu",
#         {
#             "variance scaling": {
#                 "scale": 1.0,
#                 "mode": "fan_in",
#                 "distribution": "uniform",
#             }
#         },
#     )
# ]

# merged_layers = [
#     (
#         300,
#         "relu",
#         {
#             "variance scaling": {
#                 "scale": 1.0,
#                 "mode": "fan_in",
#                 "distribution": "uniform",
#             }
#         },
#     )
# ]


# critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers,
#                             optimizer='Adam', optimizer_params={'weight_decay':0.01}, learning_rate=0.002, normalize_layers=True)

In [None]:
# build critic

state_layers = []

merged_layers = [
    (
        400,
        "relu",
        {
            "default": {
                
            }
        },
    ),
    (
        300,
        "relu",
        {
            "default": {
                
            }
        },
    )
]

# output_kernel = {"uniform":{"a":-0.003, "b":0.003}}
output_kernel = {"default":{}}

critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers,
                            output_layer_kernel=output_kernel, optimizer="Adam",
                            optimizer_params={'weight_decay':0.0},learning_rate=0.001, normalize_layers=False,
                            device=device)

In [None]:
critic

In [None]:
replay_buffer = helper.ReplayBuffer(env, 100000, device=device)
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.1, device=device)

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.99,
                            tau=0.005,
                            replay_buffer=replay_buffer,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('LunarLander-v2-continuous')],
                            device=device)

In [None]:
ddpg_agent.critic_model

In [None]:
ddpg_agent.target_critic_model

In [None]:
ddpg_agent.train(500)

In [None]:
ddpg_agent.test(10, True, 1)

# Actor Critic

In [None]:
env = gym.make("CartPole-v1")

In [None]:
dense_layers = [
    (128, 'relu', "kaiming normal"),
    (256, 'relu', "kaiming normal"),
    ]



In [None]:
policy_model = models.PolicyModel(env=env, dense_layers=dense_layers, optimizer='Adam', learning_rate=0.001,)

In [None]:
for param in policy_model.parameters():
    print(param)

In [None]:
value_model = models.ValueModel(env, dense_layers=dense_layers, optimizer='Adam', learning_rate=0.001)

In [None]:
value_model

In [None]:
for params in value_model.parameters():
    print(params)

In [None]:
actor_critic = rl_agents.ActorCritic(env,
                                     policy_model,
                                     value_model,
                                     discount=0.99,
                                     policy_trace_decay=0.5,
                                     value_trace_decay=0.5,
                                     callbacks=[rl_callbacks.WandbCallback('CartPole-v1-Actor-Critic')])

In [None]:
actor_critic.train(200)

In [None]:
actor_critic.test(10, True, 1)

# REINFORCE

In [None]:
env = gym.make("CartPole-v1")

In [None]:
dense_layers = [
    (128, 'relu', {
                    "kaiming normal": {
                        "a":1.0,
                        "mode":'fan_in'
                    }
                },
    ),
    # (256, 'relu', {
    #                 "kaiming_normal": {
    #                     "a":0.0,
    #                     "mode":'fan_in'
    #                 }
    #             },
    # )
    ]

In [None]:
dense_layers = [(128, 'relu', "kaiming normal")]

In [None]:
value_model = models.ValueModel(env, dense_layers, 'Adam', 0.001)

In [None]:
for param in value_model.parameters():
    print(param)

In [None]:
policy_model = models.PolicyModel(env, dense_layers, 'Adam', 0.001)

In [None]:
for param in policy_model.parameters():
    print(param)

In [None]:
reinforce = rl_agents.Reinforce(env, policy_model, value_model, 0.99, [rl_callbacks.WandbCallback('CartPole-v0_REINFORCE', chkpt_freq=100)])

In [None]:
reinforce.train(200, True, 50)

In [None]:
reinforce.test(10, True, 1)

# DDPG w/CNN

In [None]:
env = gym.make('CarRacing-v2')

In [None]:
cnn_layers = [
    # {
    #     "batchnorm":
    #     {
    #         "num_features":3
    #     }
    # },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 7,
            "stride": 3,
            "padding": 'valid',
            "bias": False
        }
    },
    {
        "relu":
        {

        }
    },
    {
        "batchnorm":
        {
            "num_features":32
        }
    },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 5,
            "stride": 3,
            "padding": 'valid',
            "bias": False,
        }
    },
    {
        "relu":
        {

        }
    },
    {
        "batchnorm":
        {
            "num_features":32
        }
    },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 3,
            "stride": 3,
            "padding": 'valid',
            "bias": False,
        }
    },
]

In [None]:
cnn = cnn_models.CNN(cnn_layers, env)

In [None]:
cnn

In [None]:
# build actor

dense_layers = [
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
]

# Actor for CarRacing: shared CNN feature extractor followed by the dense stack above.
# The layer-normalization switch is passed as `normalize_layers`, the keyword used by the
# DDPG and Fetch-Reach model constructions in this notebook.
# NOTE(review): confirm against models.ActorModel's signature.
actor = models.ActorModel(
    env,
    cnn_model=cnn,
    dense_layers=dense_layers,
    optimizer="Adam",
    optimizer_params={'weight_decay': 0.0},
    learning_rate=0.0001,
    normalize_layers=False,
)

In [None]:
actor

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        64,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    )
]


# Critic for CarRacing: shared CNN feature extractor, no state-only layers, merged stack above.
# The layer-normalization switch is passed as `normalize_layers`, the keyword used by the
# DDPG and Fetch-Reach model constructions in this notebook.
# NOTE(review): confirm against models.CriticModel's signature.
critic = models.CriticModel(
    env=env,
    cnn_model=cnn,
    state_layers=state_layers,
    merged_layers=merged_layers,
    optimizer="Adam",
    optimizer_params={'weight_decay': 0.0},
    learning_rate=0.0001,
    normalize_layers=False,
)

In [None]:
critic

In [None]:
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape=(1,))
noise = helper.OUNoise(shape=env.action_space.shape, mean=0.0, theta=0.15, sigma=0.01, dt=1.0, device='cuda')

In [None]:
ddpg_agent = rl_agents.DDPG(
    env,
    actor,
    critic,
    discount=0.98,
    tau=0.05,
    action_epsilon=0.2,
    replay_buffer=replay_buffer,
    batch_size=128,
    noise=noise,
    callbacks=[rl_callbacks.WandbCallback("CarRacing-v2")]
)

In [None]:
ddpg_agent.train(1000, True, 10)

In [None]:
wandb.finish()

In [None]:
wandb.login()

# HER

In [None]:
env = gym.make("Reacher-v4")

In [None]:
_,_ = env.reset()

In [None]:
achieved_goal = gym_helper.reacher_achieved_goal(env)
action = env.action_space.sample()
env.step(action)
print(f'observation: {env.get_wrapper_attr("_get_obs")()}')
print(f'distance to goal: {env.get_wrapper_attr("_get_obs")()[8::]}')
print(f'fingertip: {env.get_wrapper_attr("get_body_com")("fingertip")}')
print(f'target: {env.get_wrapper_attr("get_body_com")("target")}')

In [None]:
next_achieved_goal = env.get_wrapper_attr("_get_obs")()[8::]
desired_goal = [0.0, 0.0, 0.0]

In [None]:
# The HER goal helpers must be bound before reward_func is called: on a fresh kernel
# (Restart & Run All) reward_func does not exist until get_her_goal_functions has run.
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)
reward_func(env, action, achieved_goal, next_achieved_goal, desired_goal, 0.05)

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
desired_goal_func(env).shape

In [None]:
# build actor

dense_layers = [
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
]

# Goal-conditioned actor for Reacher (3-dimensional goal, no CNN).
# The layer-normalization switch is passed as `normalize_layers`, the keyword used by the
# DDPG and Fetch-Reach model constructions in this notebook.
# NOTE(review): confirm against models.ActorModel's signature.
actor = models.ActorModel(env,
                          cnn_model=None,
                          dense_layers=dense_layers,
                          goal_shape=(3,),
                          optimizer="Adam",
                          optimizer_params={'weight_decay':0.0},
                          learning_rate=0.0001,
                          normalize_layers=False)

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    )
]


# Goal-conditioned critic for Reacher (3-dimensional goal, no CNN).
# The layer-normalization switch is passed as `normalize_layers`, the keyword used by the
# DDPG and Fetch-Reach model constructions in this notebook.
# NOTE(review): confirm against models.CriticModel's signature.
critic = models.CriticModel(env=env,
                            cnn_model=None,
                            state_layers=state_layers,
                            merged_layers=merged_layers,
                            goal_shape=(3,),
                            optimizer="Adam",
                            optimizer_params={'weight_decay':0.0},
                            learning_rate=0.0001,
                            normalize_layers=False)

In [None]:
goal_shape = desired_goal_func(env).shape
replay_buffer = helper.ReplayBuffer(env, 100000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape,
#                        mean=0.0,
#                        theta=0.05,
#                        sigma=0.15,
#                        dt=1.0, device='cuda')

noise=helper.NormalNoise(shape=env.action_space.shape,
                         mean = 0.0,
                         stddev=0.05,
                         )

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=256,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('Reacher-v4')])

In [None]:
her = rl_agents.HER(ddpg_agent,
                    strategy='future',
                    num_goals=4,
                    tolerance=0.001,
                    desired_goal=desired_goal_func,
                    achieved_goal=achieved_goal_func,
                    reward_fn=reward_func)

In [None]:
her.train(10, 50, 16, 40, True, 1000)

In [None]:
wandb.finish()

In [None]:
her.test(10, True, 1)

In [None]:
her.save()

In [None]:
her.agent.goal_normalizer.running_std

In [None]:
loaded_her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/assets/models/her")

In [None]:
loaded_her.agent.replay_buffer.sample(10)

In [None]:
loaded_her.agent.state_normalizer.running_cnt

In [None]:
loaded_her.get_config()

In [None]:
loaded_her.test(10, True, 1)

In [None]:
10e4

# HER w/CNN

In [None]:
env = gym.make('CarRacing-v2')

In [None]:
_,_ = env.reset()

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# The goal factory returned by get_her_goal_functions is bound to `desired_goal_func`;
# `desired_goal` is a plain goal value elsewhere in this notebook and is not callable.
desired_goal_func(env).shape

In [None]:
cnn_layers = [
    # {
    #     "batchnorm":
    #     {
    #         "num_features":3
    #     }
    # },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 7,
            "stride": 3,
            "padding": 'valid',
            "bias": False
        }
    },
    {
        "relu":
        {

        }
    },
    {
        "batchnorm":
        {
            "num_features":32
        }
    },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 5,
            "stride": 3,
            "padding": 'valid',
            "bias": False,
        }
    },
    {
        "relu":
        {

        }
    },
    {
        "batchnorm":
        {
            "num_features":32
        }
    },
    {
        "conv":
        {
            "out_channels": 32,
            "kernel_size": 3,
            "stride": 3,
            "padding": 'valid',
            "bias": False,
        }
    },
]

cnn = cnn_models.CNN(cnn_layers, env)

In [None]:
# build actor

dense_layers = [
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
]

# Goal-conditioned actor for CarRacing (CNN features, 1-dimensional goal).
# The layer-normalization switch is passed as `normalize_layers`, the keyword used by the
# DDPG and Fetch-Reach model constructions in this notebook.
# NOTE(review): confirm against models.ActorModel's signature.
actor = models.ActorModel(env,
                          cnn_model=cnn,
                          dense_layers=dense_layers,
                          goal_shape=(1,),
                          optimizer="Adam",
                          optimizer_params={'weight_decay':0.0},
                          learning_rate=0.001,
                          normalize_layers=False)

In [None]:
actor

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        256,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    )
]


# Goal-conditioned critic for CarRacing (CNN features, 1-dimensional goal).
# The layer-normalization switch is passed as `normalize_layers`, the keyword used by the
# DDPG and Fetch-Reach model constructions in this notebook.
# NOTE(review): confirm against models.CriticModel's signature.
critic = models.CriticModel(env=env,
                            cnn_model=cnn,
                            state_layers=state_layers,
                            merged_layers=merged_layers,
                            goal_shape=(1,),
                            optimizer="Adam",
                            optimizer_params={'weight_decay':0.0},
                            learning_rate=0.001,
                            normalize_layers=False)

In [None]:
critic

In [None]:
goal_shape = desired_goal_func(env).shape
replay_buffer = helper.ReplayBuffer(env, 100000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape,
#                        mean=0.0,
#                        theta=0.05,
#                        sigma=0.15,
#                        dt=1.0, device='cuda')

noise=helper.NormalNoise(shape=env.action_space.shape,
                         mean = 0.0,
                         stddev=0.05,
                         )

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=256,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('CarRacing-v2')])

In [None]:
ddpg_agent.actor_model

In [None]:
her = rl_agents.HER(ddpg_agent,
                    strategy='future',
                    num_goals=4,
                    tolerance=1,
                    desired_goal=desired_goal_func,
                    achieved_goal=achieved_goal_func,
                    reward_fn=reward_func)

In [None]:
her.agent.actor_model

In [None]:
her.train(num_epochs=20,
          num_cycles=50,
          num_episodes=16,
          num_updates=40,
          render=True,
          render_freq=20
        )

In [None]:
her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/models/her")

In [None]:
wandb.finish()

In [None]:
# Manually roll out ONE episode with the HER agent, mirroring what a training episode
# does: act on normalized inputs, update the normalizer statistics, push every raw
# transition into the replay buffer and keep the trajectory in local lists for the
# relabelling cells below.

# reset environment
state, _ = her.agent.env.reset()
# instantiate empty lists to store current episode trajectory
states, actions, next_states, dones, state_achieved_goals, \
next_state_achieved_goals, desired_goals = [], [], [], [], [], [], []
# set desired goal
desired_goal = her.desired_goal_func(her.agent.env)
# set achieved goal
state_achieved_goal = her.achieved_goal_func(her.agent.env)
# add initial state and goals to local normalizer stats
# NOTE(review): other cells in this notebook reach the normalizers through
# her.agent.state_normalizer / her.agent.goal_normalizer — confirm which object owns them.
her.state_normalizer.update_local_stats(state)
her.goal_normalizer.update_local_stats(desired_goal)
her.goal_normalizer.update_local_stats(state_achieved_goal)
# set done flag
done = False
# reset episode reward to 0
episode_reward = 0
# reset steps counter for the episode
episode_steps = 0

while not done:
    # get normalized values for state and desired goal
    state_norm = her.state_normalizer.normalize(state)
    desired_goal_norm = her.goal_normalizer.normalize(desired_goal)
    # get action
    action = her.agent.get_action(state_norm, desired_goal_norm, grad=False)
    # take action
    next_state, reward, term, trunc, _ = her.agent.env.step(action)
    # resolve the done flag BEFORE the transition is recorded; updating it only at the
    # end of the loop body would store every transition (including the last one of the
    # episode) with done=False
    done = bool(term or trunc)
    # get next state achieved goal
    next_state_achieved_goal = her.achieved_goal_func(her.agent.env)
    # add next state and next state achieved goal to normalizers
    her.state_normalizer.update_local_stats(next_state)
    her.goal_normalizer.update_local_stats(next_state_achieved_goal)
    # store trajectory in replay buffer (non normalized!)
    her.agent.replay_buffer.add(state, action, reward, next_state, done,\
                                    state_achieved_goal, next_state_achieved_goal, desired_goal)

    # append step state, action, next state, and goals to respective lists
    states.append(state)
    actions.append(action)
    next_states.append(next_state)
    dones.append(done)
    state_achieved_goals.append(state_achieved_goal)
    next_state_achieved_goals.append(next_state_achieved_goal)
    desired_goals.append(desired_goal)

    # add to episode reward and increment steps counter
    episode_reward += reward
    episode_steps += 1
    # update state and state achieved goal
    state = next_state
    state_achieved_goal = next_state_achieved_goal

In [None]:
# package episode states, actions, next states, and goals into trajectory tuple
trajectory = (states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)

In [None]:
states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals = trajectory

In [None]:
for idx, (s, a, ns, d, sag, nsag, dg) in enumerate(zip(states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)):
    print(f'a={a}, d={d}, sag={sag}, nsag={nsag}, dg={dg}')

In [None]:
strategy = "future"
num_goals = 4

# Hindsight relabelling of the recorded trajectory: for each step, choose substitute
# desired goals according to `strategy`, recompute the reward against each substitute,
# print the relabelled transition and add it to the agent's replay buffer.
for idx, (state, action, next_state, done, state_achieved_goal, next_state_achieved_goal, desired_goal) in enumerate(
    zip(states, actions, next_states, dones, state_achieved_goals, next_state_achieved_goals, desired_goals)
):
    if strategy == "final":
        # single substitute: the goal achieved at the very end of the episode
        substitute_goals = iter([next_state_achieved_goals[-1]])
    elif strategy == 'future':
        # up to `num_goals` substitutes, limited by how many later steps remain;
        # each is drawn lazily (one random index per relabelled transition)
        remaining = len(states) - idx - 1
        substitute_goals = (
            next_state_achieved_goals[np.random.randint(idx + 1, len(states))]
            for _ in range(min(num_goals, remaining))
        )
    else:
        # any other strategy relabels nothing
        substitute_goals = iter(())

    for new_desired_goal in substitute_goals:
        new_reward = her.reward_fn(state_achieved_goal, next_state_achieved_goal, new_desired_goal)
        print(f'transition: action={action}, reward={new_reward}, done={done}, state_achieved_goal={state_achieved_goal}, next_state_achieved_goal={next_state_achieved_goal}, desired_goal={new_desired_goal}')
        her.agent.replay_buffer.add(state, action, new_reward, next_state, done, state_achieved_goal, next_state_achieved_goal, new_desired_goal)
    

    


In [None]:
s, a, r, ns, d, sag, nsag, dg = her.agent.replay_buffer.sample(100)

In [None]:
for i in range(100):
    print(f'{i}: a={a[i]}, r={r[i]}, d={d[i]}, sag={sag[i]}, nsag={nsag[i]}, dg={dg[i]} ')

# HER Pendulum

In [None]:
env = gym.make('Pendulum-v1')

In [None]:
# build actor

dense_layers = [
    (
        400,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    ),
    (
        300,
        "relu",
        {
            "variance scaling": {
                "scale": 1.0,
                "mode": "fan_in",
                "distribution": "uniform",
            }
        },
    )
]

# Actor for Pendulum (no CNN).
# The layer-normalization switch is passed as `normalize_layers`, the keyword used by the
# DDPG and Fetch-Reach model constructions in this notebook.
# NOTE(review): confirm against models.ActorModel's signature.
actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, optimizer='Adam',
                          optimizer_params={'weight_decay':0.01}, learning_rate=0.001, normalize_layers=False)

In [None]:
# build critic

state_layers = [
    
]

merged_layers = [
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    ),
    (
        64,
        "relu",
        {
            "kaiming uniform": {
                
            }
        },
    )
]


# Critic for Pendulum (no CNN, no state-only layers).
# The layer-normalization switch is passed as `normalize_layers`, the keyword used by the
# DDPG and Fetch-Reach model constructions in this notebook.
# NOTE(review): confirm against models.CriticModel's signature.
critic = models.CriticModel(
    env=env,
    cnn_model=None,
    state_layers=state_layers,
    merged_layers=merged_layers,
    optimizer="Adam",
    optimizer_params={'weight_decay': 0.0},
    learning_rate=0.001,
    normalize_layers=False,
)

In [None]:
replay_buffer = helper.ReplayBuffer(env, 100000, (3,))
noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.99,
                            tau=0.005,
                            replay_buffer=replay_buffer,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback('Pendulum-v1')])

In [None]:
def desired_goal_func(env):
    """Return the fixed desired goal: a fresh float array of three zeros.

    `env` is accepted for signature compatibility and is not read.
    """
    return np.zeros(3, dtype=float)

def achieved_goal_func(env):
    """Return the achieved goal: whatever the env's `_get_obs` callable returns.

    `_get_obs` is looked up through `env.get_wrapper_attr` and called with no arguments.
    """
    get_obs = env.get_wrapper_attr('_get_obs')
    return get_obs()

def reward_func(env):
    """Placeholder reward function: performs no computation and returns None."""
    return None

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='none',
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=10.0
)

In [None]:
her.agent.critic_model

In [None]:
her.agent.target_critic_model

In [None]:
her.train(1,1,100,1)

In [None]:
wandb.finish()

In [None]:
state = env.observation_space.sample()
state

In [None]:
her.agent.state_normalizer.normalize(state)

In [None]:
goal = her.desired_goal_func(her.agent.env)
goal

In [None]:
her.agent.goal_normalizer.normalize(goal)

In [None]:
def remove_renders(folder_path):
    """Delete rendered videos and their metadata files directly inside a folder.

    Only entries whose names end in ".mp4" or ".meta.json" are removed. Other
    entries are left in place, and subfolders are not descended into.

    Args:
        folder_path: Directory to clean; it is listed with `os.listdir`.
    """
    render_suffixes = (".mp4", ".meta.json")
    # Snapshot the matching names first, then delete them one by one.
    doomed = [name for name in os.listdir(folder_path) if name.endswith(render_suffixes)]
    for name in doomed:
        os.remove(os.path.join(folder_path, name))

In [None]:
remove_renders("/workspaces/RL_Agents/pytorch/src/app/assets/models/ddpg/renders/training")

# HER Fetch-Reach (Robotics)

In [None]:
env = gym.make("FetchReach-v2", max_episode_steps=50)

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
achieved_goal_func(env)

In [None]:
env.get_wrapper_attr("_get_obs")()

In [None]:
# reset env state
env.reset()

In [None]:
goal_shape = desired_goal_func(env).shape

In [None]:
goal_shape

In [None]:
# build actor

# Three identical hidden layers. Each entry looks like
# (units, activation, kernel-initializer spec) — confirm against models.ActorModel.
dense_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, goal_shape=goal_shape, optimizer='Adam',
                          optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
actor

In [None]:
# build critic

# No layers are applied to the state on its own.
state_layers = []

# Three identical hidden layers. Each entry looks like
# (units, activation, kernel-initializer spec) — confirm against models.CriticModel.
merged_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]


critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, goal_shape=goal_shape, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
critic

In [None]:
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.2,
                            replay_buffer=replay_buffer,
                            batch_size=256,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback("FetchReach-v2")])

In [None]:
ddpg_agent.critic_model

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='future',
    tolerance=0.05,
    num_goals=4,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=5.0
)

In [None]:
her.train(num_epochs=50,
          num_cycles=50,
          num_episodes=16,
          num_updates=40,
          render=True,
          render_freq=1000)

In [None]:
states, action, rewards, next_states, dones, achieved_goals, next_achieved_goals, desired_goals = her.agent.replay_buffer.sample(2)

In [None]:
desired_goals

In [None]:
her.agent.env.get_wrapper_attr("distance_threshold")

In [None]:
# get success
her.agent.env.get_wrapper_attr("_is_success")(achieved_goal_func(her.agent.env), desired_goal_func(her.agent.env))

In [None]:
# Call the env's "goal_distance" attribute on an achieved/desired goal pair.
# NOTE(review): `next_state_achieved_goal` and `desired_goal` are not assigned in any nearby cell;
# the replay-buffer sample cell binds `next_achieved_goals` and `desired_goals` instead. Confirm
# which names are intended, otherwise this raises NameError on a fresh kernel.
her.agent.env.get_wrapper_attr("goal_distance")(next_state_achieved_goal, desired_goal, None)

In [None]:
pusher_her = rl_agents.HER.load("/workspaces/RL_Agents/pytorch/src/app/assets/models/her")

In [None]:
pusher_her.agent.env.reset()

In [None]:
pusher_her.get_config()

In [None]:
wandb.finish()

In [None]:
np.linalg.norm(pusher_her.agent.env.get_wrapper_attr("get_body_com")("goal") - pusher_her.agent.env.get_wrapper_attr("get_body_com")("object"))

In [None]:
pusher_her.agent.replay_buffer.get_config()

In [None]:

pusher_her.agent.replay_buffer.desired_goals

In [None]:
## TEST ENV
env = gym.make("Pusher-v5", render_mode="rgb_array")

In [None]:
env = gym.wrappers.RecordVideo(
                    env,
                    "/renders/training",
                    episode_trigger=lambda x: True,
                )


In [None]:
# Roll out 1000 random actions, starting a new episode whenever the current one ends.
state, _ = env.reset()

for i in range(1000):
    # take action
    next_state, reward, term, trunc, _ = env.step(env.action_space.sample())
    # Gymnasium requires a reset once an episode has terminated or been truncated;
    # stepping past that point is undefined behaviour.
    if term or trunc:
        state, _ = env.reset()
env.close()

# HER Fetch Push (Robotics)

In [None]:
env = gym.make('FetchPush-v2')

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# reset env state
env.reset()

In [None]:
goal_shape = desired_goal_func(env).shape

In [None]:
# build actor

# Three identical hidden layers. Each entry looks like
# (units, activation, kernel-initializer spec) — confirm against models.ActorModel.
dense_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, goal_shape=goal_shape, optimizer='Adam',
                          optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
# build critic

# No layers are applied to the state on its own.
state_layers = []

# Three identical hidden layers. Each entry looks like
# (units, activation, kernel-initializer spec) — confirm against models.CriticModel.
merged_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]


critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, goal_shape=goal_shape, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.3,
                            replay_buffer=replay_buffer,
                            batch_size=128,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback("FetchPush-v2")],
                            save_dir="fetch_push/models/ddpg/"
                            )

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='final',
    tolerance=0.05,
    num_goals=4,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=5.0,
    save_dir="fetch_push/models/her/"
)

In [None]:
her.train(num_epochs=50,
          num_cycles=50,
          num_episodes=16,
          num_updates=40,
          render=True,
          render_freq=1000)

# TESTING MULTITHREADING

In [None]:
env = gym.make('FetchPush-v2')

In [None]:
desired_goal_func, achieved_goal_func, reward_func = gym_helper.get_her_goal_functions(env)

In [None]:
# reset env state
env.reset()

In [None]:
goal_shape = desired_goal_func(env).shape

In [None]:
# build actor

# Three identical hidden layers. Each entry looks like
# (units, activation, kernel-initializer spec) — confirm against models.ActorModel.
dense_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, goal_shape=goal_shape, optimizer='Adam',
                          optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
# build critic

# No layers are applied to the state on its own.
state_layers = []

# Three identical hidden layers. Each entry looks like
# (units, activation, kernel-initializer spec) — confirm against models.CriticModel.
merged_layers = [(64, "relu", {"kaiming uniform": {}}) for _ in range(3)]


critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers, goal_shape=goal_shape, optimizer="Adam", optimizer_params={'weight_decay':0.0}, learning_rate=0.00001, normalize_layers=False)

In [None]:
replay_buffer = helper.ReplayBuffer(env, 1000000, goal_shape)
# noise = helper.OUNoise(shape=env.action_space.shape, dt=1.0, device='cuda')
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.05)

In [None]:
ddpg_agent = rl_agents.DDPG(env=env,
                            actor_model=actor,
                            critic_model=critic,
                            discount=0.98,
                            tau=0.05,
                            action_epsilon=0.3,
                            replay_buffer=replay_buffer,
                            batch_size=128,
                            noise=noise,
                            callbacks=[rl_callbacks.WandbCallback("FetchPush-v2")],
                            save_dir="fetch_push/models/ddpg/"
                            )

In [None]:
her = rl_agents.HER(
    agent=ddpg_agent,
    strategy='final',
    num_workers=4,
    tolerance=0.05,
    num_goals=4,
    desired_goal=desired_goal_func,
    achieved_goal=achieved_goal_func,
    reward_fn=reward_func,
    normalizer_clip=5.0,
    save_dir="fetch_push/models/her/"
)

In [None]:
her.train()

# TESTING

In [None]:
# load config
config_path = "/workspaces/RL_Agents/pytorch/src/app/HER_Test/her/config.json"
with open(config_path, 'r') as file:
    config = json.load(file)

In [None]:
config

In [None]:
agent = rl_agents.HER.load(config)

In [None]:
for callback in agent.agent.callbacks:
    print(callback._sweep)

# Co-Occurrence

In [None]:
import subprocess

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'assets/wandb_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    wandb_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(wandb_config)

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'assets/sweep_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    sweep_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(sweep_config)

In [None]:
# Write `sweep_config` (read from assets/sweep_config.json in an earlier cell) to
# <cwd>/sweep/train_config.json, creating the sweep/ folder if needed.
# NOTE(review): despite its name, `sweep_config` is what lands in the *train* config file,
# while `wandb_config` lands in sweep_config.json below — confirm this naming is intentional.
os.makedirs('sweep', exist_ok=True)
train_config_path = os.path.join(os.getcwd(), 'sweep/train_config.json')
with open(train_config_path, 'w') as f:
    json.dump(sweep_config, f)

# Write `wandb_config` (read from assets/wandb_config.json in an earlier cell) to
# <cwd>/sweep/sweep_config.json.
sweep_config_path = os.path.join(os.getcwd(), 'sweep/sweep_config.json')
with open(sweep_config_path, 'w') as f:
    json.dump(wandb_config, f)

In [None]:
command = ['python', 'sweep.py']

# Set the environment variable
os.environ['WANDB_DISABLE_SERVICE'] = 'true'

subprocess.Popen(command)

In [None]:
# Set the environment variable
os.environ['WANDB_DISABLE_SERVICE'] = 'true'

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'sweep/sweep_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    sweep_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(sweep_config)

In [None]:
# Define the path to your JSON configuration file
config_file_path = 'sweep/train_config.json'

# Read the JSON configuration file
with open(config_file_path, 'r') as file:
    train_config = json.load(file)

# Print the configuration to verify it has been loaded correctly
print(train_config)

In [None]:
from dash_callbacks import run_agent

# Register the sweep with W&B; the project name is read from the sweep config itself.
sweep_id = wandb.sweep(sweep=sweep_config, project=sweep_config["project"])
# loop over num wandb agents
num_agents = 2
# The loop variable is named `agent_index` so it does not overwrite the notebook-level `agent`.
for agent_index in range(num_agents):
    # TODO: launch one sweep agent per iteration (e.g. via the imported `run_agent`).
    # The original loop body was missing and the cell did not parse; the intended
    # call and its arguments need to be confirmed before filling this in.
    pass

In [None]:
sweep_config

In [None]:
env = gym.make("FetchReach-v2")

In [None]:
type(env)

In [None]:
env_spec = env.spec.to_json()

In [None]:
env_spec

In [None]:
type(env_spec)

In [None]:
env = gym.make(gym.envs.registration.EnvSpec.from_json(env_spec))

In [None]:
env.spec

In [None]:
def load_env_spec(env_spec_dict):
    """Rebuild a Gymnasium `EnvSpec` from its serialized form.

    Args:
        env_spec_dict: Either a mapping of `EnvSpec` constructor keyword
            arguments, or the JSON text produced by `EnvSpec.to_json()`.

    Returns:
        The reconstructed `EnvSpec`.
    """
    spec_cls = gym.envs.registration.EnvSpec
    # `to_json()` yields text, which cannot be unpacked with `**`; route it through `from_json`.
    if isinstance(env_spec_dict, (str, bytes, bytearray)):
        return spec_cls.from_json(env_spec_dict)
    # Create a new EnvSpec instance using the dictionary
    return spec_cls(**env_spec_dict)

In [None]:
load_env_spec(env_spec)

In [None]:
config_path = 'sweep/agent_config_58.json'

with open(config_path, 'r') as file:
    agent_config = json.load(file)

her = rl_agents.HER.load(agent_config)

In [None]:
her.get_config()

# TD3

In [78]:
# env = gym.make('LunarLanderContinuous-v3')
# env = gym.make("BipedalWalker-v3")
env = gym.make("Pendulum-v1")

In [79]:
# Rebuild the environment from its own spec with RGB-array rendering enabled,
# then wrap it so episodes are recorded under <save_dir>/renders/training.
env = gym.make(env.spec, render_mode="rgb_array")
save_dir = "/workspaces/RL_Agents/pytorch/src/app/td3_test/td3"
os.makedirs(save_dir + "/renders/training", exist_ok=True)
env = gym.wrappers.RecordVideo(
    env,
    save_dir + "/renders/training",
    # Record when (episode_id + 1) is a multiple of `render_freq`.
    # NOTE(review): `render_freq` is not assigned in this cell; it is only looked up when the
    # trigger is called — confirm it is defined earlier in the notebook.
    episode_trigger=lambda episode_id: (episode_id+1) % render_freq == 0,
)


[33mWARN: Overwriting existing videos at /workspaces/RL_Agents/pytorch/src/app/td3_test/td3/renders/training folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)[0m



In [81]:
env.spec

EnvSpec(id='Pendulum-v1', entry_point='gymnasium.envs.classic_control.pendulum:PendulumEnv', reward_threshold=None, nondeterministic=False, max_episode_steps=200, order_enforce=True, disable_env_checker=False, kwargs={'render_mode': 'rgb_array'}, namespace=None, name='Pendulum', version=1, additional_wrappers=(WrapperSpec(name='RecordVideo', entry_point='gymnasium.wrappers.rendering:RecordVideo', kwargs={'video_folder': '/workspaces/RL_Agents/pytorch/src/app/td3_test/td3/renders/training', 'episode_trigger': <function <lambda> at 0x7fa1a0c68dc0>, 'step_trigger': None, 'video_length': 0, 'name_prefix': 'rl-video', 'disable_logger': True}),), vector_entry_point=None)

In [74]:
env.spec.to_json()

'{"id": "Pendulum-v1", "entry_point": "gymnasium.envs.classic_control.pendulum:PendulumEnv", "reward_threshold": null, "nondeterministic": false, "max_episode_steps": 200, "order_enforce": true, "disable_env_checker": false, "kwargs": {}, "additional_wrappers": [], "vector_entry_point": null}'

In [60]:
json_spec

'{"id": "Pendulum-v1", "entry_point": "gymnasium.envs.classic_control.pendulum:PendulumEnv", "reward_threshold": null, "nondeterministic": false, "max_episode_steps": 200, "order_enforce": true, "disable_env_checker": false, "kwargs": {}, "additional_wrappers": [], "vector_entry_point": null}'

In [44]:
from gymnasium.envs.registration import EnvSpec

def serialize_env_spec(env_spec):
    """Collect selected fields of an environment spec into a plain dictionary.

    The values are copied over as-is (no conversion is applied), keyed by the
    attribute name they were read from.

    Args:
        env_spec: Object exposing the spec attributes listed below.

    Returns:
        A dict with one entry per listed attribute, in the listed order.
    """
    field_names = (
        "id",
        "entry_point",
        "reward_threshold",
        "nondeterministic",
        "max_episode_steps",
        "order_enforce",
        "disable_env_checker",
        "kwargs",
        "additional_wrappers",
        "vector_entry_point",
    )
    return {field: getattr(env_spec, field) for field in field_names}

In [58]:
from_json_spec = gym.envs.registration.EnvSpec.from_json(json_spec)

In [59]:
from_json_spec

EnvSpec(id='Pendulum-v1', entry_point='gymnasium.envs.classic_control.pendulum:PendulumEnv', reward_threshold=None, nondeterministic=False, max_episode_steps=200, order_enforce=True, disable_env_checker=False, kwargs={}, namespace=None, name='Pendulum', version=1, additional_wrappers=(), vector_entry_point=None)

In [52]:
for key,val in env_spec.items():
    print(f'{key}:{val};{type(val)}')

id:Pendulum-v1;<class 'str'>
entry_point:gymnasium.envs.classic_control.pendulum:PendulumEnv;<class 'str'>
reward_threshold:None;<class 'NoneType'>
nondeterministic:False;<class 'bool'>
max_episode_steps:200;<class 'int'>
order_enforce:True;<class 'bool'>
disable_env_checker:False;<class 'bool'>
kwargs:{};<class 'dict'>
additional_wrappers:();<class 'tuple'>
vector_entry_point:None;<class 'NoneType'>


In [48]:
gym.envs.registration.EnvSpec.from_json(env_spec)

TypeError: the JSON object must be str, bytes or bytearray, not dict

In [47]:
env = gym.make(env_spec)

AssertionError: 

In [30]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on hosts without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [31]:
# build actor

# Two identical hidden layers. Each entry looks like
# (units, activation, kernel-initializer spec) — confirm against models.ActorModel.
dense_layers = [(256, "relu", {"default": {}}) for _ in range(2)]

# output_kernel = {"uniform":{"a":-0.003, "b":0.003}}
output_kernel = {"default":{}}

actor = models.ActorModel(env, cnn_model=None, dense_layers=dense_layers, output_layer_kernel=output_kernel,
                          optimizer='Adam', optimizer_params={'weight_decay':0.0}, learning_rate=3e-4,
                          normalize_layers=False, device=device)

In [None]:
actor

In [None]:
for param in actor.parameters():
    print(param)

In [32]:
# build critic

# No layers are applied to the state on its own.
state_layers = list()

# Two identical hidden layers. Each entry looks like
# (units, activation, kernel-initializer spec) — confirm against models.CriticModel.
merged_layers = [(256, "relu", {"default": {}}) for _ in range(2)]

# output_kernel = {"uniform":{"a":-0.003, "b":0.003}}
output_kernel = {"default":{}}

critic = models.CriticModel(env=env, cnn_model=None, state_layers=state_layers, merged_layers=merged_layers,
                            output_layer_kernel=output_kernel, optimizer="Adam",
                            optimizer_params={'weight_decay':0.0},learning_rate=3e-4, normalize_layers=False,
                            device=device)

In [None]:
critic

In [None]:
for params in critic.parameters():
    print(params)

In [33]:
replay_buffer = helper.ReplayBuffer(env, 1000000, device=device)
noise = helper.NormalNoise(shape=env.action_space.shape, mean=0.0, stddev=0.1, device=device)

In [34]:
# Build the TD3 agent from the actor, critic, replay buffer and noise objects created in earlier cells.
td3 = rl_agents.TD3(
    env=env,
    actor_model=actor,
    critic_model=critic,
    replay_buffer=replay_buffer,
    batch_size=256,
    noise=noise,
    actor_update_delay=2,
    # Label the W&B callback with the id of the env actually in use instead of a hard-coded
    # name, so the label cannot drift when a different env is selected.
    callbacks=[rl_callbacks.WandbCallback(env.spec.id)],
    use_mpi=False,
    device=device
)

In [None]:
for params in td3.critic_model_a.parameters():
    print(params)

In [None]:
for params in td3.critic_model_b.parameters():
    print(params)

In [None]:
td3.get_config()

In [35]:
td3.train(num_episodes=100)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
[34m[1mwandb[0m: Adding directory to artifact (./models/td3)... Done. 0.0s


episode 1, score -71.89686065415344, avg_score -71.89686065415344, episode_time 10.32s, avg_episode_time 10.32s, avg_step_time 0.015053s, avg_learn_time 0.013915s, avg_steps_per_episode 1600.00
episode 2, score -116.3688738818268, avg_score -94.13286726799012, episode_time 0.83s, avg_episode_time 5.57s, avg_step_time 0.014766s, avg_learn_time 0.013542s, avg_steps_per_episode 827.50
episode 3, score -113.92613937068731, avg_score -100.73062463555584, episode_time 0.71s, avg_episode_time 3.95s, avg_step_time 0.014797s, avg_learn_time 0.013563s, avg_steps_per_episode 567.00
episode 4, score -108.52828200667413, avg_score -102.68003897833542, episode_time 1.55s, avg_episode_time 3.35s, avg_step_time 0.014134s, avg_learn_time 0.013056s, avg_steps_per_episode 452.00
episode 5, score -113.49380857307712, avg_score -104.84279289728377, episode_time 2.16s, avg_episode_time 3.11s, avg_step_time 0.015202s, avg_learn_time 0.014061s, avg_steps_per_episode 389.40
episode 6, score -121.84719833813297

0,1
action_0,▆▃▆█▆▅▂▆▆▇▆▄▇▆▇▂▄██▇▄▄▄██▂▄▆▅█▃▅▇▃▁▁▃▄▂▇
action_1,▄▆█▇▃▇█▄▇██████▁█▇██▂█▇█▇▂▁▁▁▁▁▁▂▁▇█▆▃▂▁
action_2,▄▅▆█▅▇▁▅▅▄▅▄▄▅▇▁▇▃▃█▄▄▇▃▂▆███████▇██▁▃▂▁
action_3,▃▃▅▃▅▇▂▆▇▅█▃▅▆▃▇▇▇█▂█████▆▂▇▅▄▃▅▆▅▁▁▆▆▇█
actor_loss,▁▁▁▁▁▁▁▁▁▁▁▂▂▂▃▃▅▃▆▃▅▅▅█▆▆▆▆▆▇▅▇▆▅▇▅▆▅▇▇
actor_predictions,▆█▃▃▃▃▄▅▆▄▄▄▃▃▃▃▃▄▃▃▃▃▃▃▂▂▂▃▂▂▂▁▂▂▂▁▁▁▂▁
avg_reward,█▄▃▂▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁
best,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
critic_loss,▁▁▁▂▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▂▁▂▂▃▂▃▂▂█▂▂▃▂▃▂▃▃▃▂▂
critic_predictions,███████████▇▇▇▆▆▄▆▃▆▄▄▄▁▃▃▃▃▃▂▄▂▃▄▂▄▃▄▂▂

0,1
action_0,-0.45116
action_1,-0.49909
action_2,0.06091
action_3,-0.65854
actor_loss,37.8564
actor_predictions,-0.07779
avg_reward,-118.13261
best,False
critic_loss,106.45586
critic_predictions,-37.8564


In [21]:
any(td3.replay_buffer.dones) == 1

False

In [17]:
td3.save('src/app/models/td3')

In [67]:
# load config
with open('/workspaces/RL_Agents/pytorch/src/app/td3_test/td3/config.json', 'r') as file:
    config = json.load(file)
td3 = rl_agents.TD3.load(config)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is poss

In [71]:
for key,val in config.items():
    print(f'{key}:{type(val)};{val}')

agent_type:<class 'str'>;TD3
env:<class 'str'>;{"id": "Pendulum-v1", "entry_point": "gymnasium.envs.classic_control.pendulum:PendulumEnv", "reward_threshold": null, "nondeterministic": false, "max_episode_steps": 200, "order_enforce": true, "disable_env_checker": false, "kwargs": {}, "additional_wrappers": [], "vector_entry_point": null}
actor_model:<class 'dict'>;{'env': 'Pendulum-v1', 'cnn_model': None, 'num_layers': 4, 'dense_layers': [[256, 'relu', {'default': {}}], [256, 'relu', {'default': {}}]], 'output_layer_kernel': {'default': {}}, 'goal_shape': None, 'optimizer': 'Adam', 'optimizer_params': {'weight_decay': 0}, 'learning_rate': 0.0001, 'normalize_layers': False}
critic_model:<class 'dict'>;{'env': 'Pendulum-v1', 'cnn_model': None, 'num_layers': 4, 'state_layers': [], 'merged_layers': [[256, 'relu', {'default': {}}], [256, 'relu', {'default': {}}]], 'output_layer_kernel': {'default': {}}, 'goal_shape': None, 'optimizer': 'Adam', 'optimizer_params': {'weight_decay': 0}, 'learn

In [65]:
td3.get_config()

{'agent_type': 'TD3',
 'env': '{"id": "Pendulum-v1", "entry_point": "gymnasium.envs.classic_control.pendulum:PendulumEnv", "reward_threshold": null, "nondeterministic": false, "max_episode_steps": 200, "order_enforce": true, "disable_env_checker": false, "kwargs": {}, "additional_wrappers": [], "vector_entry_point": null}',
 'actor_model': {'env': 'Pendulum-v1',
  'cnn_model': None,
  'num_layers': 4,
  'dense_layers': [[256, 'relu', {'default': {}}],
   [256, 'relu', {'default': {}}]],
  'output_layer_kernel': {'default': {}},
  'goal_shape': None,
  'optimizer': 'Adam',
  'optimizer_params': {'weight_decay': 0},
  'learning_rate': 0.0001,
  'normalize_layers': False},
 'critic_model': {'env': 'Pendulum-v1',
  'cnn_model': None,
  'num_layers': 4,
  'state_layers': [],
  'merged_layers': [[256, 'relu', {'default': {}}],
   [256, 'relu', {'default': {}}]],
  'output_layer_kernel': {'default': {}},
  'goal_shape': None,
  'optimizer': 'Adam',
  'optimizer_params': {'weight_decay': 0},
 

In [66]:
td3.save()

In [37]:
td3.test(10, True, 1, td3.save_dir)

new save dir: models/td3/



[33mWARN: Overwriting existing videos at /workspaces/RL_Agents/pytorch/src/app/models/td3/renders/testing folder (try specifying a different `video_folder` for the `RecordVideo` wrapper if this is not desired)[0m



episode 1, score -132.20585432354616, avg_score -132.20585432354616
episode 2, score -128.11128140719205, avg_score -130.1585678653691
episode 3, score -112.35276252416146, avg_score -124.22329941829987
episode 4, score -114.97414180118714, avg_score -121.9110100140217
episode 5, score -125.37473318914644, avg_score -122.60375464904664
episode 6, score -111.36876013943242, avg_score -120.73125556411094
episode 7, score -130.2563140302084, avg_score -122.09197820212486
episode 8, score -125.88898792958807, avg_score -122.56660441805778
episode 9, score -127.68787832038116, avg_score -123.13563485164926
episode 10, score -121.85973918368853, avg_score -123.00804528485318


0,1
avg_reward,▁▂▆▇▇█▇▇▇▇
episode_reward,▁▂█▇▃█▂▃▃▄

0,1
avg_reward,-123.00805
episode_reward,-121.85974
