In [1]:
import torch
import numpy as np
import random

In [2]:
from src_new.reinforcement.lunar_lander import *

In [3]:
# --- Rollout / sampling configuration shared by both pipelines ---
n_envs = 1                    # passed to Runner as `n_envs`
trajectory_lenght = 2048      # steps per rollout; spelling matches the Runner keyword argument
batch_size = 64               # passed to the new TrajectorySampler
num_epochs_per_traj = 10      # passed to both samplers as their epoch count
total_iters = 250000          # not referenced by the cells visible in this notebook section
device = None                 # forwarded unchanged to GymActorCritic and Buffer
env_name = 'LunarLander-v2'

# Seed torch, NumPy and the stdlib RNG with the same value (0), in that order.
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(0)

In [4]:
# Pre-drawn integer actions in [0, 4): one row per rollout step plus one extra
# row, one column per environment. Passed to GymActorCritic as `debug_tens`.
# NOTE(review): presumably replayed in place of sampling so that both pipelines
# take identical actions — confirm against GymActorCritic.
debug_actions_shape = (trajectory_lenght + 1, n_envs)
global_actions = torch.randint(low=0, high=4, size=debug_actions_shape)

In [5]:
# --- "pred" pipeline: objects from src_new.reinforcement.lunar_lander ---
pred_logger = Logger()
pred_env = GymEnv(gym_name=env_name)

# This actor-critic instance is also wrapped by the reference ("tgt") Policy in
# a later cell; `debug_tens` hands it the pre-drawn actions.
ac = GymActorCritic(
    input_dim=pred_env.state_dim,
    action_dim=pred_env.action_dim,
    device=device,
    debug_tens=global_actions,
)
# No optimizer / PPO object is constructed in this cell; only the rollout and
# sampling objects are built.

pred_runner = Runner(
    environment=pred_env,
    actor_critic=ac,
    n_envs=n_envs,
    trajectory_lenght=trajectory_lenght,
)
# Reset every environment index 0..n_envs-1 before the first rollout.
pred_runner.reset(list(range(n_envs)))
pred_inference_logger = InferenceMetricsRunner(
    runner=pred_runner,
    logger=pred_logger,
)

# Advantage estimation + reward normalization feed the buffer used by the sampler.
pred_gae = GAE()
pred_normalizer = RewardNormalizer()
buffer = Buffer(
    gae=pred_gae,
    reward_normalizer=pred_normalizer,
    device=device,
)
pred_sampler = TrajectorySampler(
    pred_runner,
    buffer,
    num_epochs_per_traj=num_epochs_per_traj,
    batch_size=batch_size,
)


In [6]:
# Pull one sample from the new ("pred") sampler; its fields are printed in a later cell.
pred_sample = pred_sampler.sample()

In [7]:
# Re-seed torch, NumPy and the stdlib RNG with 0 (same value as the config
# cell) before the reference pipeline is built.
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(0)

In [8]:
from lunar_lander_debug_notebook import (
    Summaries,
    Normalize,
    Policy,
    AsArray,
    EnvRunner,
    GAE as GAE_OLD,
    TrajectorySampler as TrajectorySampler_OLD,
    NormalizeAdvantages
)

In [9]:
import gymnasium as gym
# Reference ("tgt") environment. The id comes from `env_name` (rather than a
# second hard-coded literal) so it cannot drift from the environment the pred
# pipeline builds via GymEnv(gym_name=env_name).
# NOTE(review): obs=False / ret=True presumably disable observation
# normalization and enable return normalization — confirm against Normalize.
tgt_env = Normalize(
    Summaries(gym.make(env_name, render_mode="rgb_array")),
    obs=False,
    ret=True,
)
tgt_env.reset(seed=0)

# Wrap the same actor-critic instance used by the pred pipeline, so both
# pipelines evaluate one and the same model.
policy = Policy(model=ac)

In [10]:
# --- "tgt" pipeline: reference runner/sampler from lunar_lander_debug_notebook ---
tgt_runner_transforms = [AsArray(), GAE_OLD(policy)]
tgt_runner = EnvRunner(tgt_env, policy, trajectory_lenght, transforms=tgt_runner_transforms)
tgt_runner.reset(seed=0)
tgt_sampler_transforms = [NormalizeAdvantages()]

# NOTE(review): `_debug_iter` is presumably the read position into the
# pre-drawn debug actions; resetting it makes the reference rollout start from
# the first action again — confirm against GymActorCritic.
ac._debug_iter = 0

# The minibatch count is derived from the shared config instead of a literal:
# with trajectory_lenght=2048 and batch_size=64 this is 32 (the value that was
# previously hard-coded), and it keeps the reference minibatch size aligned
# with the new sampler's `batch_size` if either setting changes.
tgt_sampler = TrajectorySampler_OLD(
    tgt_runner,
    num_epochs=num_epochs_per_traj,
    num_minibatches=trajectory_lenght // batch_size,
    transforms=tgt_sampler_transforms,
)

In [11]:
# Pull one sample from the reference ("tgt") sampler for comparison with pred_sample.
tgt_sample = tgt_sampler.get_next()

In [12]:
# Print the first 10 entries of every field in the reference sample.
for field_name in tgt_sample.keys():
    field_head = tgt_sample[field_name][:10]
    print(field_name, field_head)

actions [0 3 1 0 3 3 3 3 1 3]
log_probs [-1.41075122 -1.32259488 -1.43524849 -1.41357636 -1.31906962 -1.31453574
 -1.30926061 -1.30473089 -1.44244266 -1.30267704]
values [0.07636027 0.08223443 0.08993445 0.09443133 0.10070428 0.10852748
 0.11693294 0.12532517 0.13368753 0.13942483]
observations [[ 0.00570612  1.3990337   0.5779653  -0.5282997  -0.0066053  -0.13091765
   0.          0.        ]
 [ 0.01141253  1.3865713   0.57718486 -0.55392617 -0.01307467 -0.1293994
   0.          0.        ]
 [ 0.01719465  1.3735025   0.5866798  -0.5809347  -0.02144145 -0.16735117
   0.          0.        ]
 [ 0.02288361  1.3598258   0.57497466 -0.6079506  -0.02745714 -0.12032503
   0.          0.        ]
 [ 0.02857265  1.3455497   0.5749912  -0.6346211  -0.03347334 -0.12033501
   0.          0.        ]
 [ 0.03433599  1.3306733   0.5843054  -0.6613757  -0.04135226 -0.15759297
   0.          0.        ]
 [ 0.04019766  1.3152038   0.5965973  -0.6878521  -0.05168391 -0.20665224
   0.          0.        

In [13]:
# Print the first 10 entries of every field in the new-pipeline sample.
for field_name in pred_sample.keys():
    field_head = pred_sample[field_name][:10]
    print(field_name, field_head)

advantages tensor([-6.0276, -1.8237, -1.0912, -1.0856, -0.8337, -0.4706, -0.1551,  0.0912,
         0.2931,  0.3341])
log_probs_chosen tensor([-1.4108, -1.3226, -1.4352, -1.4136, -1.3191, -1.3145, -1.3093, -1.3047,
        -1.4424, -1.3027])
values tensor([0.0764, 0.0822, 0.0899, 0.0944, 0.1007, 0.1085, 0.1169, 0.1253, 0.1337,
        0.1394])
states [<src_new.reinforcement.lunar_lander.GymState object at 0x7f7fe1b35d20>, <src_new.reinforcement.lunar_lander.GymState object at 0x7f80bc0d22f0>, <src_new.reinforcement.lunar_lander.GymState object at 0x7f7fe1b36e90>, <src_new.reinforcement.lunar_lander.GymState object at 0x7f7fe1b36d10>, <src_new.reinforcement.lunar_lander.GymState object at 0x7f7fe1b35db0>, <src_new.reinforcement.lunar_lander.GymState object at 0x7f7fe1cefe80>, <src_new.reinforcement.lunar_lander.GymState object at 0x7f7fe1cefc10>, <src_new.reinforcement.lunar_lander.GymState object at 0x7f7fe1cefdc0>, <src_new.reinforcement.lunar_lander.GymState object at 0x7f7fe1cefd30>

In [32]:
# Collect one raw trajectory from each pipeline. `_debug_iter` is reset to 0
# before each rollout; statement order matters here.
# NOTE(review): presumably the reset rewinds the shared actor-critic to the
# start of the pre-drawn action sequence — confirm against GymActorCritic.
ac._debug_iter = 0
tgt_traj = tgt_runner.get_next()
ac._debug_iter = 0
# run() returns an indexable result; only its first element is kept.
pred_traj = pred_runner.run()[0]

In [33]:
# Called for its effect on a one-element list holding pred_traj; the return
# value is not captured.
# NOTE(review): presumably normalizes the trajectory's rewards in place (and
# may update running statistics), in which case re-running this cell is not
# idempotent — confirm against RewardNormalizer.
pred_normalizer([pred_traj])

In [35]:
# Recompute advantages for the new-pipeline trajectory with a fresh GAE instance.
advs = GAE()(pred_traj)

# Report the first step (if any) at which the two pipelines' value estimates
# differ by more than 1e-7; no output means every compared step agrees.
for i in range(trajectory_lenght):
    value_gap = abs(tgt_traj['values'][i] - pred_traj.values[i])
    if not value_gap > 1e-7:
        continue
    print(i)
    break

In [82]:
# First five reference advantages, shown for side-by-side comparison with `advs`.
tgt_traj['advantages'][:5]

array([-16.90639323,  -7.34869223,  -5.68349441,  -5.67070286,
        -5.09813637])

In [83]:

# First five advantages from the new GAE; in the recorded outputs they match the
# reference values to roughly three decimal places.
advs[:5]

[-16.906734613453963,
 -7.349055211198873,
 -5.683664658190591,
 -5.67084857285135,
 -5.098227355989695]

In [25]:
# Rebinds `tgt_sampler`, replacing the sampler built from TrajectorySampler_OLD.
# NOTE(review): `make_ppo_runner` is not among the explicitly imported names;
# presumably it comes from the star import or an earlier kernel state — confirm,
# otherwise this cell fails on a fresh Restart & Run All.
tgt_sampler = make_ppo_runner(tgt_env, policy)

In [26]:
# Rebinds `tgt_sample` with a sample from whichever `tgt_sampler` is currently bound.
tgt_sample = tgt_sampler.get_next()

In [27]:
# Actions stored in the new-pipeline sample (the recorded output holds 64 entries).
pred_sample['actions_chosen']

tensor([0, 3, 2, 0, 1, 2, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 0, 0, 3, 0, 3, 3, 0, 1,
        3, 1, 3, 1, 3, 0, 3, 2, 1, 3, 1, 3, 1, 0, 2, 0, 0, 3, 2, 1, 0, 2, 2, 3,
        2, 0, 1, 3, 1, 0, 0, 2, 3, 2, 1, 0, 3, 0, 3, 1])

In [28]:
# Actions stored in the reference sample; in the recorded outputs they diverge
# from pred_sample['actions_chosen'] from the third entry onward.
tgt_sample['actions']

array([0, 3, 1, 0, 3, 3, 3, 3, 1, 3, 1, 2, 0, 3, 2, 0, 0, 0, 2, 1, 2, 3,
       3, 2, 0, 1, 1, 1, 1, 0, 1, 0, 3, 0, 3, 1, 2, 3, 3, 0, 2, 3, 0, 1,
       3, 1, 3, 3, 2, 3, 0, 1, 1, 1, 3, 0, 3, 2, 0, 3, 3, 2, 3, 2])