In [18]:
import os
import sys

import gymnasium as gym
from torch.optim import Adam

# Put the parent directory on the import path so the `rlib` imports below resolve.
# NOTE(review): presumably the notebook lives one level below the repo root — confirm.
sys.path.append(os.path.abspath(".."))

from rlib.algorithms.a2c import a2c
from rlib.algorithms.ppo import ppo
from rlib.algorithms.reinforce import reinforce
from rlib.common.evaluation import get_trajectory, validation
from rlib.common.policies import (
    DiscreteStochasticMlpPolicy,
    MlpCritic,
    StochasticMlpPolicy,
)

# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [276]:
# Create the CartPole-v1 environment with the "rgb_array" render mode.
# NOTE(review): `env` is rebound by the Pendulum-v1 and BipedalWalker-v3 cells
# below, so later cells see whichever environment was created last.
env = gym.make("CartPole-v1", render_mode="rgb_array")

In [277]:
# Draw ONE sample per space and print it together with its own shape.
# (The original drew a second, unrelated random sample just to read `.shape`,
# so the printed shape did not belong to the printed value.)
for space in (env.observation_space, env.action_space):
    sample = space.sample()
    print(sample, sample.shape)

[-3.3906274e+00 -2.9158448e+38  3.0878833e-01 -2.2403954e+38] (4,)
0 ()


In [300]:
# Rebind `env` to Pendulum-v1, replacing any previously created environment.
env = gym.make("Pendulum-v1")

In [301]:
# Draw ONE sample per space and print it together with its own shape.
# (The original drew a second, unrelated random sample just to read `.shape`,
# so the printed shape did not belong to the printed value.)
for space in (env.observation_space, env.action_space):
    sample = space.sample()
    print(sample, sample.shape)

[-0.915588   -0.09977326  3.9985425 ] (3,)
[-0.05857551] (1,)


In [198]:
# Rebind `env` to BipedalWalker-v3, replacing any previously created environment.
# NOTE(review): in top-to-bottom order this is the environment the training
# sections below would use, yet the saved `3 1` dimension output further down
# matches the Pendulum-v1 shapes — the cells were evidently run out of order.
env = gym.make("BipedalWalker-v3")

In [199]:
# Draw ONE sample per space and print it together with its own shape.
# (The original drew a second, unrelated random sample just to read `.shape`,
# so the printed shape did not belong to the printed value.)
for space in (env.observation_space, env.action_space):
    sample = space.sample()
    print(sample, sample.shape)

[-2.2708755   4.943143   -4.5599866   0.6844208  -0.45849904 -2.7709544
 -2.6829946  -0.06156341  1.4338952   1.7893121  -2.1389308   1.6121464
 -3.8557954   3.723489    0.7315291  -0.6467077  -0.6370253   0.7312314
 -0.89675224  0.8066579   0.9811636  -0.01427133 -0.7791947   0.497158  ] (24,)
[-0.16842724 -0.7188277  -0.68321896 -0.8027634 ] (4,)


In [179]:
# Show one sampled action as a column vector. `-1` lets NumPy infer the row
# count, so the cell no longer fails when `env` has an action dimension other
# than 4 (the original hard-coded `reshape(4, 1)`).
env.action_space.sample().reshape(-1, 1)

array([[ 0.9118681],
       [-0.150479 ],
       [ 0.7591273],
       [ 0.9307303]], dtype=float32)

In [305]:
# Derive the network input/output sizes from whichever environment is currently
# bound to `env`. The `discrete` flag is computed from the action space instead
# of being set by hand, so it cannot disagree with the active environment
# (a hand-set `False` with a discrete env would fail on `shape[0]`).
discrete = isinstance(env.action_space, gym.spaces.Discrete)

obs_dim = env.observation_space.shape[0]

# Discrete spaces expose their number of actions as `n`; otherwise the first
# entry of the action space's `shape` is used as the action dimensionality.
action_dim = env.action_space.n if discrete else env.action_space.shape[0]

print(obs_dim, action_dim)

3 1


### Reinforce

In [272]:
# Build the REINFORCE policy from `obs_dim` / `action_dim` computed in the
# dimension cell above. The original referenced `input_size` / `output_size`,
# which no visible cell defines (NameError on a fresh kernel), and switched
# policy classes by commenting code in and out; the `discrete` flag is used
# instead, mirroring the PPO section.
if discrete:
    policy = DiscreteStochasticMlpPolicy(obs_dim, action_dim)
else:
    # action_scale=2 is kept from the original cell.
    # NOTE(review): presumably matches Pendulum's action bound — confirm.
    policy = StochasticMlpPolicy(obs_dim, action_dim, action_scale=2)

optimizer = Adam(policy.parameters(), lr=1e-3)

In [273]:
# Roll out the current `policy` in `env` with deterministic=False.
# NOTE(review): `traj` is not read by any later visible cell — presumably a
# smoke test of `get_trajectory`; confirm or remove.
traj = get_trajectory(env, policy, deterministic=False)

In [None]:
# Train `policy` with REINFORCE using a budget of 500_000 timesteps.
# NOTE(review): this cell has no execution count or saved output in the notebook.
reinforce(env, policy, optimizer, total_timesteps=500_000)

In [264]:
# Inspect the raw policy output for one randomly sampled observation; the saved
# output is a pair of scalar tensors.
# NOTE(review): this calls `.forward` directly rather than `policy(...)`; if the
# policy is an nn.Module this bypasses hooks — confirm that is intended.
policy.forward(env.observation_space.sample())

(tensor(-0.1113, grad_fn=<UnbindBackward0>),
 tensor(-0.9518, grad_fn=<UnbindBackward0>))

In [268]:
# Evaluate `policy` with deterministic=True; the saved output is a single float.
validation(env, policy, deterministic=True)

-1697.8311858462926

### A2C

In [277]:
# Build the A2C actor and critic from `obs_dim` / `action_dim` computed in the
# dimension cell above. The original referenced `input_size` / `output_size`,
# which no visible cell defines (NameError on a fresh kernel), and switched
# actor classes via commented-out code; the `discrete` flag is used instead,
# mirroring the PPO section.
if discrete:
    actor = DiscreteStochasticMlpPolicy(obs_dim, action_dim)
else:
    # action_scale=2 is kept from the original cell.
    # NOTE(review): presumably matches Pendulum's action bound — confirm.
    actor = StochasticMlpPolicy(obs_dim, action_dim, action_scale=2)
critic = MlpCritic(obs_dim)

# Separate optimizers so the actor and critic can use different learning rates.
actor_optimizer = Adam(actor.parameters(), lr=3e-4)
critic_optimizer = Adam(critic.parameters(), lr=1e-4)

In [278]:
# Train with A2C (separate actor/critic optimizers) for a budget of 100_000 timesteps.
# NOTE(review): the saved log below stops at steps_n 20000 with a cut-off last
# line — the run appears to have been interrupted or the output truncated.
a2c(env, actor, critic, actor_optimizer, critic_optimizer, total_timesteps=100_000)

steps_n: 2000
mean_trajectory_rewards: -1217.060302734375
mean_trajectory_length: 199.90000915527344
steps_n: 4000
mean_trajectory_rewards: -1151.2532958984375
mean_trajectory_length: 199.90000915527344
steps_n: 6000
mean_trajectory_rewards: -1255.719482421875
mean_trajectory_length: 199.90000915527344
steps_n: 8000
mean_trajectory_rewards: -991.3551025390625
mean_trajectory_length: 199.90000915527344
steps_n: 10000
mean_trajectory_rewards: -1137.3951416015625
mean_trajectory_length: 199.90000915527344
steps_n: 12000
mean_trajectory_rewards: -1250.375244140625
mean_trajectory_length: 199.90000915527344
steps_n: 14000
mean_trajectory_rewards: -1279.54296875
mean_trajectory_length: 199.90000915527344
steps_n: 16000
mean_trajectory_rewards: -1271.0977783203125
mean_trajectory_length: 199.90000915527344
steps_n: 18000
mean_trajectory_rewards: -1371.1456298828125
mean_trajectory_length: 199.90000915527344
steps_n: 20000
mean_trajectory_rewards: -1120.009033203125
mean_trajectory_length: 199

In [77]:
# Evaluate `actor` with deterministic=True; the saved output is a single float.
# NOTE(review): this cell's execution count (77) is far lower than the training
# cell's (278), so the saved score may belong to an earlier actor — re-run to confirm.
validation(env, actor, deterministic=True)

-1205.740195305502

### PPO

In [333]:
# PPO actor: pick the policy class that matches the action-space type.
actor = (
    DiscreteStochasticMlpPolicy(obs_dim, action_dim)
    if discrete
    else StochasticMlpPolicy(obs_dim, action_dim)
)

# State-value network over observations.
critic = MlpCritic(obs_dim)

# Independent optimizers; the critic uses the larger learning rate here.
actor_optimizer = Adam(actor.parameters(), lr=1e-4)
critic_optimizer = Adam(critic.parameters(), lr=5e-4)

In [None]:
# Train with PPO; no hyperparameters are passed, so `ppo`'s own defaults apply.
# NOTE(review): this cell has no execution count or saved output in the notebook.
ppo(env, actor, critic, actor_optimizer, critic_optimizer)

In [334]:
# Reset the environment and keep only the first element of the returned pair
# (the initial observation); the second element is discarded.
obs, _ = env.reset()

In [335]:
# Sanity check: shape of the initial observation (saved output: (3,)).
obs.shape

(3,)

In [337]:
# Probe `actor.predict` on a random observation. The saved output is a pair:
# a NumPy float32 array and a (1, 1) tensor that carries a grad_fn.
# NOTE(review): the second element is presumably the action log-probability — confirm.
actor.predict(env.observation_space.sample())

(array([0.91028386], dtype=float32),
 tensor([[-1.9151]], grad_fn=<ViewBackward0>))

In [336]:
# `torch` is imported here because this cell uses it before the notebook's
# only other `import torch` (a later cell); without this the cell raises
# NameError on a fresh top-to-bottom run.
import torch

# Concatenate the second element of two independent `predict` calls along
# dim 0 and check the resulting shape (saved output: torch.Size([2, 1])).
torch.cat(
    [
        actor.predict(env.observation_space.sample())[1],
        actor.predict(env.observation_space.sample())[1],
    ],
    dim=0,
).shape

torch.Size([2, 1])

In [338]:
from rlib.common.buffer import RolloutBuffer

In [339]:
# Create an empty rollout buffer with default settings.
rb = RolloutBuffer()

In [340]:
# Fill the buffer by running `actor` in `env` with rollout_size=10; the tensors
# inspected in the following cells all have a leading dimension of 10.
rb.collect_rollouts(env, actor, rollout_size=10)

In [341]:
# Fetch the collected batch. Later cells index it with the string keys
# "observations", "actions", "log_probs" and "q_estimations".
data = rb.get_data()

In [342]:
# Shape check for the stored observations (saved output: torch.Size([10, 3])).
data["observations"].shape

torch.Size([10, 3])

In [343]:
# Shape check for the stored actions (saved output: torch.Size([10, 1])).
data["actions"].shape

torch.Size([10, 1])

In [344]:
# Confirm the actor accepts the whole batch at once: the first element of the
# forward output has one row per stored observation (saved output: torch.Size([10, 1])).
actor.forward(data["observations"])[0].shape

torch.Size([10, 1])

In [345]:
import torch

In [346]:
# Pull the tensors needed for the PPO update out of the rollout batch.
observations, actions, old_log_probs = (
    data[key] for key in ("observations", "actions", "log_probs")
)

# Clipping half-width for the PPO probability ratio.
epsilon = 0.1

In [347]:
# Shape check for the stored log-probabilities (saved output: torch.Size([10, 1])).
old_log_probs.shape

torch.Size([10, 1])

In [348]:
# Manual walk-through of the PPO clipped-surrogate loss on the collected batch.

# Probability ratio new/old for the stored actions, computed in log space.
# The stored log-probs are treated as constants.
_, new_log_probs = actor.get_action(observations, action=actions)
ratio = (new_log_probs - old_log_probs.detach()).exp()
ratio_clipped = ratio.clamp(1 - epsilon, 1 + epsilon)

# Advantage = rollout target (constant) minus the critic's prediction, so
# gradients reach the critic only through `values`.
values = critic(observations)
targets = data["q_estimations"].reshape(-1, 1)
advantages = targets.detach() - values

# Every tensor should be (batch, 1); a mismatch here would silently broadcast.
print(targets.shape, values.shape, advantages.shape)
print(old_log_probs.shape, new_log_probs.shape, ratio.shape)

# Surrogate terms use detached advantages so the actor loss does not train the critic.
actor_loss_1 = ratio * advantages.detach()
actor_loss_2 = ratio_clipped * advantages.detach()

loss = {
    # Pessimistic (element-wise minimum) surrogate, negated for gradient descent.
    "actor": -torch.min(actor_loss_1, actor_loss_2).mean(),
    # Mean squared advantage serves as the critic's regression loss.
    "critic": advantages.pow(2).mean(),
}

torch.Size([10, 1]) torch.Size([10, 1]) torch.Size([10, 1])
torch.Size([10, 1]) torch.Size([10, 1]) torch.Size([10, 1])


In [13]:
# Evaluate `actor` with deterministic=True; the saved output is a single float.
# NOTE(review): the execution count (13) predates the PPO setup cells above, so
# the saved score may not belong to the actor defined in this section — re-run to confirm.
validation(env, actor, deterministic=True)

-945.4736045591187

In [142]:
# Mean of a toy Gaussian: a single row with two zero-valued components.
mu = torch.zeros(1, 2)
mu

tensor([[0., 0.]])

In [144]:
# Standard deviation of the toy Gaussian: a single row with two unit components.
std = torch.ones(1, 2)
std

tensor([[1., 1.]])

In [145]:
from torch.distributions import Normal

In [146]:
# Element-wise Normal built from `mu` / `std`, and one random draw from it.
dist = Normal(loc=mu, scale=std)
action = dist.sample()

In [148]:
# The sample has the same shape as the distribution parameters (saved output: torch.Size([1, 2])).
action.shape

torch.Size([1, 2])

In [151]:
# `log_prob` is element-wise: one value per action component (saved output: torch.Size([1, 2])).
dist.log_prob(action).shape

torch.Size([1, 2])

In [153]:
# Summing over dim=1 collapses the per-component log-probs into one value per
# row, i.e. the joint log-density when the components are independent.
dist.log_prob(action).sum(dim=1)

tensor([-1.9676])

In [180]:
from stable_baselines3 import PPO

In [191]:
# Reference setup: a Stable-Baselines3 PPO agent with the "MlpPolicy" on Pendulum-v1.
# Note that this rebinds `env` for every cell that runs afterwards.
env = gym.make("Pendulum-v1")
agent = PPO("MlpPolicy", env)

In [192]:
# Reset before stepping; the saved output is an (observation array, empty dict) pair.
env.reset()

(array([ 0.03049863, -0.9995348 ,  0.8736682 ], dtype=float32), {})

In [193]:
# Query the SB3 agent on a random observation; only the first returned element (the action) is kept.
action, _ = agent.predict(env.observation_space.sample())

In [195]:
# Display the predicted action (saved output: a one-element float32 array).
action

array([-1.9694774], dtype=float32)

In [194]:
# Apply the predicted action. The saved output is a 5-tuple; per the Gymnasium
# API these are (observation, reward, terminated, truncated, info).
env.step(action)

(array([ 0.02193137, -0.9997595 , -0.17140453], dtype=float32),
 -2.4527108869676995,
 False,
 False,
 {})