# PPO Transformer Model



In [28]:
from src.environments.memory import MemoryEnv
from minigrid.wrappers import ViewSizeWrapper
import plotly.express as px 

# Build a size-7 memory environment with a fixed corridor length and start
# position, then wrap it so the agent's view size is 7.
memory_env = MemoryEnv(
    size=7,
    random_length=False,
    random_start_pos=False,
    max_steps=200,
    render_mode='rgb_array',
)
env = ViewSizeWrapper(memory_env, 7)
obs, info = env.reset()

# First figure: the full rendered frame. Second figure: channel 0 of the
# agent's image observation, transposed for display.
rendered_frame = env.render()
px.imshow(rendered_frame).show()
first_channel = obs['image'][:, :, 0].T
px.imshow(first_channel).show()

In [20]:
# Advance the environment by one step using action 1 and display the result.
# Only the observation is needed, so index the step tuple rather than
# unpacking the unused values into `_`: assigning to `_` in IPython shadows
# the automatic "last output" variable for the rest of the session.
# NOTE(review): action 1 is presumably "turn right" in MiniGrid's action
# enum -- confirm against MemoryEnv.
obs = env.step(1)[0]

# Rendered frame first, then channel 0 of the image observation (transposed).
px.imshow(env.render()).show()
px.imshow(obs['image'][:,:,0].T).show()

# Test out probe env 7

In [1]:
import os

import gymnasium as gym

from src.config import (EnvironmentConfig, OnlineTrainConfig, RunConfig,
                        TransformerModelConfig, LSTMModelConfig)
from src.ppo.my_probe_envs import Probe1, Probe2, Probe3, Probe4, Probe5, Probe6, Probe7
from src.ppo.runner import ppo_runner
import src.environments.registration
from src.ppo.train import train_ppo

# Register each probe environment with gymnasium under the id "Probe<N>-v0"
# (N = 1..7) so the probes can be used for debugging.
probes = [Probe1, Probe2, Probe3, Probe4, Probe5, Probe6, Probe7]
for probe_number, probe_cls in enumerate(probes, start=1):
    gym.envs.registration.register(id=f"Probe{probe_number}-v0", entry_point=probe_cls)




Registering DynamicObstaclesMultiEnv-v0
Registering CrossingMultiEnv-v0
Registering Probe Envs


In [2]:
from src.environments.environments import make_env
# Manually roll out Probe7 for ten steps with a constant action of 1 and
# print the observation, reward, done flag and the env's step counter.
# NOTE(review): the loop keeps stepping after `done` is True without calling
# reset(); in the recorded output the terminal step prints a nested
# observation ([[1.]]) while every other step prints a flat one ([0.]) --
# worth checking in Probe7 itself.
env = Probe7()
obs, info = env.reset()
num_steps = 10
for _step in range(num_steps):
    step_result = env.step(1)
    obs, reward, done, truncated, info = step_result
    print(obs,reward, done, env.time_step)


[0.] 0.0 False 1
[0.] 0.0 False 2
[0.] 0.0 False 3
[[1.]] 1.0 True 4
[0.] 0.0 False 5
[0.] 0.0 False 6
[0.] 0.0 False 7
[0.] 0.0 False 8
[0.] 0.0 False 9
[0.] 0.0 False 10


In [3]:
import torch as t
from dataclasses import dataclass

@dataclass
class DummyRunConfig:
    """Minimal stand-in run configuration for notebook debugging.

    An instance is passed to ``train_ppo`` as ``run_config``. All fields have
    defaults, so ``DummyRunConfig()`` is a complete config.
    """

    # Experiment identity and seeding.
    exp_name: str = "test"
    seed: int = 1

    # Experiment-tracking settings (tracking is off by default).
    track: bool = False
    wandb_project_name: str = "test"
    wandb_entity: str = "test"

    # Torch device; defaults to CPU.
    device: t.device = t.device("cpu")


# Shared instance used by the training cells below.
run_config = DummyRunConfig()


@dataclass
class DummyEnvironmentConfig:
    """Minimal stand-in environment configuration for notebook debugging.

    An instance is passed to ``train_ppo`` as ``environment_config`` and to
    ``LSTMModelConfig``. All fields have defaults.
    """

    # Which environment to build and how observations are presented.
    env_id: str = "MiniGrid-Dynamic-Obstacles-8x8-v0"
    one_hot_obs: bool = False
    img_obs: bool = False
    fully_observed: bool = False

    # Episode and seeding settings.
    max_steps: int = 1000
    seed: int = 1
    view_size: int = 7

    # Rendering / video capture.
    capture_video: bool = False
    video_dir: str = "videos"
    render_mode: str = "rgb_array"

    # Spaces start out unset (None); the training cell assigns
    # `action_space` from the vectorised env before training.
    action_space: None = None
    observation_space: None = None

    # Torch device; defaults to CPU.
    device: t.device = t.device("cpu")


# Shared instance; later cells mutate it in place.
environment_config = DummyEnvironmentConfig()

@dataclass
class DummyOnlineConfig:
    """Minimal stand-in online-training configuration for notebook debugging.

    An instance is passed to ``train_ppo`` as ``online_config``. All fields
    have defaults, so ``DummyOnlineConfig()`` is a complete config.
    NOTE(review): presumably mirrors ``src.config.OnlineTrainConfig`` --
    confirm the field list stays in sync with it.
    """

    use_trajectory_model: bool = False
    hidden_size: int = 64
    total_timesteps: int = 1000
    learning_rate: float = 0.00025
    # Fixed: this default was previously written as `False,` (with a trailing
    # comma), which made it the one-element tuple `(False,)`. A non-empty
    # tuple is truthy, so any `if config.decay_lr:` check would have treated
    # learning-rate decay as enabled.
    decay_lr: bool = False
    num_envs: int = 10
    num_steps: int = 128
    gamma: float = 0.99
    gae_lambda: float = 0.95
    num_minibatches: int = 10
    update_epochs: int = 4
    clip_coef: float = 0.2
    ent_coef: float = 0.01
    vf_coef: float = 0.5
    max_grad_norm: float = 2
    # No trajectory file by default (None despite the `str` annotation).
    trajectory_path: str = None
    fully_observed: bool = False
    batch_size: int = 64
    minibatch_size: int = 4
    prob_go_from_end: float = 0.0
    # Torch device; defaults to CPU.
    device: t.device = t.device("cpu")


# Shared instance; later cells mutate it in place (e.g. total_timesteps).
online_config = DummyOnlineConfig()

In [None]:
# Train a PPO agent with an LSTM model config on four synchronous copies of
# the Probe7 environment.
env_name = "Probe7-v0"
env_config = EnvironmentConfig(
    env_id=env_name,
    render_mode=None,
    max_steps=None,
    fully_observed=False,
)
env_fns = [make_env(env_config, idx, idx, "test") for idx in range(4)]
envs = gym.vector.SyncVectorEnv(env_fns)

# NOTE: PPO currently runs its built-in tests inside main whenever it detects
# "Probe" in the env name. This is meant to be fixed eventually.
# The shared dummy configs are mutated in place here, so any later cell that
# reads them sees these values.
environment_config.env_id = env_name
environment_config.action_space = envs.single_action_space
online_config.total_timesteps = 2000
model_config = LSTMModelConfig(
    environment_config,
    recurrence=5,
    arch="simple_endpool_res",
    use_memory=True,
)

agent = train_ppo(
    run_config=run_config,
    online_config=online_config,
    environment_config=environment_config,
    model_config=model_config,
    envs=envs,
)

In [None]:
from gymnasium.spaces import Box

In [None]:
# Draw one random sample from a Box space with bounds 0 and 1 to inspect the
# shape and dtype of what it returns (unseeded, so the value varies per run).
Box(low=0, high=1).sample()

array([0.5043442], dtype=float32)

In [3]:
# Build four synchronous copies of Probe3 and display the shape of the
# batched observation returned by reset().
env_name = "Probe3-v0"
env_config = EnvironmentConfig(
    env_id=env_name,
    render_mode=None,
    max_steps=None,
    fully_observed=False,
)
env_fns = [make_env(env_config, idx, idx, "test") for idx in range(4)]
envs = gym.vector.SyncVectorEnv(env_fns)
reset_result = envs.reset()
reset_result[0].shape

  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


(4, 1)

In [4]:
# Build four synchronous copies of Probe7 and display the shape of the
# batched observation returned by reset().
env_name = "Probe7-v0"
env_config = EnvironmentConfig(
    env_id=env_name,
    render_mode=None,
    max_steps=None,
    fully_observed=False,
)
env_fns = [make_env(env_config, idx, idx, "test") for idx in range(4)]
envs = gym.vector.SyncVectorEnv(env_fns)
reset_result = envs.reset()
reset_result[0].shape

(4, 1)