# Recurrent AC Model



In [1]:
import gymnasium as gym
import torch as t 
import torch.optim as optim
import numpy as np

from src.models.trajectory_lstm import TrajectoryLSTM
from src.ppo.agent import PPOAgent, PPOScheduler
from src.ppo.memory import Memory
from src.config import EnvironmentConfig, LSTMModelConfig, OnlineTrainConfig
from src.ppo.utils import get_obs_shape 
from src.environments.environments import make_env
from src.utils import DictList
from src.ppo.agent import LSTMPPOAgent

# To instantiate the agent we first need the config objects and the environment.

# Configuration objects; the LSTM model config is derived from the environment config.
run_name = "dev"
environment_config = EnvironmentConfig()
lstm_config = LSTMModelConfig(environment_config)
online_config = OnlineTrainConfig()

# Keyword arguments that are identical for every environment. Only the seed
# and the index change from one environment to the next.
shared_env_kwargs = dict(
    env_id=environment_config.env_id,
    capture_video=environment_config.capture_video,
    run_name=run_name,
    max_steps=environment_config.max_steps,
    fully_observed=environment_config.fully_observed,
    flat_one_hot=environment_config.one_hot_obs,
    agent_view_size=environment_config.view_size,
    render_mode="rgb_array",
)

# One make_env(...) result per environment, each seeded with base seed + index.
env_fns = [
    make_env(seed=environment_config.seed + env_idx, idx=env_idx, **shared_env_kwargs)
    for env_idx in range(online_config.num_envs)
]
envs = gym.vector.SyncVectorEnv(env_fns)

# The agent and the memory are both placed on the CPU.
cpu = t.device("cpu")
lstm_agent = LSTMPPOAgent(
    envs=envs,
    environment_config=environment_config,
    lstm_config=lstm_config,
    device=cpu,
)
memory = Memory(envs, online_config, device=cpu)

# Run a single rollout of `num_steps` steps.
# NOTE(review): presumably this fills `memory` with the collected experience
# that the update cell reads back via get_minibatches() -- confirm in
# LSTMPPOAgent.rollout.
lstm_agent.rollout(memory, online_config.num_steps, envs)

  self.mask = 1 - t.tensor(done, device=self.device, dtype=t.float)


In [3]:
from src.ppo.loss_functions import calc_clipped_surrogate_objective, calc_value_function_loss, calc_entropy_bonus
import torch.nn as nn

# `args` is a short alias for the online training config used throughout this cell.
args = online_config
update_epochs = args.update_epochs
num_updates = args.total_timesteps // args.batch_size

# The third argument to make_optimizer is the configured learning rate scaled by 1e-4.
base_lr = args.learning_rate
optimizer, scheduler = lstm_agent.make_optimizer(num_updates, base_lr, base_lr * 1e-4)

for _ in range(update_epochs):
    # A fresh set of minibatches is requested for every epoch.
    for minibatch in memory.get_minibatches():
        # Forward pass on the preprocessed observations and the stored recurrent state.
        # NOTE(review): open question carried over from the original code --
        # should the recurrent state passed here come from the previous timestep?
        model_input = lstm_agent.preprocess_obs(DictList(minibatch.obs))
        outputs = lstm_agent.model(model_input, minibatch.recurrence_memory)
        probs, values, recurrence_memory = (
            outputs['dist'],
            outputs['value'],
            outputs['memory'],
        )

        # The three terms of the objective.
        surrogate = calc_clipped_surrogate_objective(
            probs,
            minibatch.actions,
            minibatch.advantages,
            minibatch.logprobs,
            args.clip_coef,
        )
        value_loss = calc_value_function_loss(values, minibatch.returns, args.vf_coef)
        entropy = calc_entropy_bonus(probs, args.ent_coef)
        objective = surrogate - value_loss + entropy

        # Backpropagate the combined objective, clip the gradient norm, then step.
        # NOTE(review): backward() is called on the objective itself rather than
        # its negation, so this presumably relies on make_optimizer building a
        # maximising optimizer -- confirm.
        optimizer.zero_grad()
        objective.backward()
        nn.utils.clip_grad_norm_(lstm_agent.model.parameters(), args.max_grad_norm)
        optimizer.step()

# The scheduler is stepped once, after every epoch over this rollout has finished.
scheduler.step()