# MiniGrid Environment

Try out the environment by running the following command:


```bash
python -m minigrid.manual_control
```

Later we can benchmark against torch-rl 


In [9]:
import gymnasium as gym
from minigrid.wrappers import RGBImgPartialObsWrapper, ImgObsWrapper

env = gym.make('MiniGrid-Empty-8x8-v0')
# env = RGBImgPartialObsWrapper(env) # Get pixel observations
env = ImgObsWrapper(env) # Get rid of the 'mission' field
obs, _ = env.reset() # This now produces an RGB tensor only
# obs

In [None]:
import torch as t 
import plotly.express as px
obs = t.tensor(obs)
obs.shape
px.imshow(obs)


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [None]:
env = gym.make('MiniGrid-Empty-8x8-v0')
env = RGBImgPartialObsWrapper(env) # Get pixel observations
env = ImgObsWrapper(env) # Get rid of the 'mission' field
obs, _ = env.reset() # This now produces an RGB tensor only

# take several actions, store the observations, actions, returns and timesteps
all_obs = []
all_actions = []
all_returns = []
all_timesteps = []


for i in range(10):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    all_obs.append(obs)
    all_actions.append(action)
    all_returns.append(reward)
    all_timesteps.append(i)

# convert to tensors.unsqueeze(0)
all_obs = t.tensor(all_obs)
all_actions = t.tensor(all_actions).reshape(-1, 1)
all_returns = t.tensor(all_returns)
all_returns = t.randn((10, 1))
all_returns_to_go = all_returns.flip(0).cumsum(0).flip(0).reshape(-1, 1)
all_timesteps = t.tensor(all_timesteps).reshape(-1, 1)


Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_new.cpp:233.)



In [None]:
print(all_returns.shape)

torch.Size([10, 1])


In [None]:
print(all_returns_to_go.shape)

torch.Size([10, 1])


# Getting a basic architecture


In [None]:
# for the grid world environment we will a small CNN to extract features from the image
# we will use the same CNN as in the original paper

obs, _, _, _, _ = env.step(2)
obs = t.tensor(obs)

import torch as t
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange

# to do: make this a custom class with hooks from transformer lense
# to do: work out how to feature visualize this

class StateEncoder(nn.Module):
    def __init__(self, n_embed):
        super(StateEncoder, self).__init__()
        self.n_embed = n_embed
        # input has shape 56 x 56 x 3
        # output has shape 1 x 1 x 512
        self.conv1 = nn.Conv2d(3, 32, 8, stride=4, padding=0) # 56 -> 13
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2, padding=0) # 13 -> 5
        self.conv3 = nn.Conv2d(64, 64, 3, stride=1, padding=0) # 5 -> 3
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(576, n_embed)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.flatten(x)
        x = self.fc(x)
        x = F.relu(x)
        return x

# we will use the same CNN as in the original paper
cnn = StateEncoder(64).to("cpu")
x = obs.unsqueeze(0).to(t.float32)
x = rearrange(x, 'b h w c-> b c h w')
cnn(x)

tensor([[ 0.0000,  0.0000,  4.6144,  1.2588,  0.1821,  0.0000,  0.0000,  0.0000,
          0.0000, 10.2853,  5.1550,  0.0000,  0.0000,  0.0000,  0.0777,  0.8252,
          1.6667,  3.1823,  1.6241,  0.0000,  2.2314,  1.1757,  1.5849,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  8.0670,
          2.2828,  0.0000,  0.0000,  4.6453,  0.0000,  7.2832,  0.0000,  1.5017,
          0.0000,  1.6794,  0.0000,  0.0000,  0.0000,  0.0000,  6.0537,  0.0000,
          9.3869,  3.8999,  0.0000,  1.0585,  0.0000,  0.0000,  0.0000,  4.4552,
          0.0000,  0.0000,  3.5174,  0.0000,  0.0000,  2.0047,  0.5330,  0.0000]],
       grad_fn=<ReluBackward0>)

For reference: https://github.com/kzl/decision-transformer/blob/master/atari/mingpt/model_atari.py

In [None]:

    # def forward(self, R: t.tensor, s: t.tensor, a: t.tensor, t: t.tensor):
    #     '''
    #     R: return
    #     s: state
    #     a: action
    #     t: timestep
    #     '''
    #     s = s.to(torch.float32)
    #     s = rearrange(s, 'b h w c-> b c h w')

    #     pos_emb = self.pos_embedding(t)
    #     state_emb = self.state_embedding(s) + pos_emb
    #     action_emb = self.action_embeddings(a) + pos_emb
    #     ret_emb = self.ret_emb(R) + pos_emb

    #     input_embeds = torch.stack([state_emb, action_emb, ret_emb], dim=1)
    #     print(input_embeds.shape)
    #     print(input_embeds.dtype)
    #     input_embeds = rearrange(input_embeds, 'batch sar block_size d_model -> batch sar block_size d_model')
        
    #     x = self.transformer(input_embeds)
        
    #     return x


Notes:
- This turned out to be really complicated. 
- Specifically:
    - it seems like the model has very different formulations during reward_conditioned vs naive. And also changes size if targets are used.
    - This includes things like the token embeddings for actions not going through in naive mode, but going through in reward_conditioned mode.
        - This is fine because a model is only ever of one type
    - However, the model isn't using padding of any kind? I should look for evidence of this. 

In [None]:
all_timesteps.unsqueeze(1).shape

torch.Size([10, 1, 1])

In [None]:
all_timesteps.repeat(3,1, 1).shape

torch.Size([3, 10, 1])

In [None]:
print(all_actions.repeat(3,1, 1).shape)
print(all_returns_to_go.repeat(3,1, 1).shape)
print(all_timesteps.repeat(3,1, 1).shape)
print(all_obs.repeat(3,1, 1, 1, 1).shape)

torch.Size([3, 10, 1])
torch.Size([3, 10, 1])
torch.Size([3, 10, 1])
torch.Size([3, 10, 56, 56, 3])


In [None]:
rearrange(t.tensor([[1,2],[4,5],[7,8]]), 'b (n d) -> b n d', n=1, d=2)

tensor([[[1, 2]],

        [[4, 5]],

        [[7, 8]]])

In [None]:

# reference code:

# all_global_pos_emb = torch.repeat_interleave(self.global_pos_emb, batch_size, dim=0) # batch_size, traj_length, n_embd
# position_embeddings = torch.gather(all_global_pos_emb, 1, torch.repeat_interleave(timesteps, self.config.n_embd, dim=-1)) + self.pos_emb[:, :token_embeddings.shape[1], :]
# x = self.drop(token_embeddings + position_embeddings)

In [None]:
import torch as t 
import gymnasium as gym
from minigrid.wrappers import RGBImgPartialObsWrapper, ImgObsWrapper

env = gym.make('MiniGrid-Empty-8x8-v0')
env = RGBImgPartialObsWrapper(env) # Get pixel observations
env = ImgObsWrapper(env) # Get rid of the 'mission' field
obs, _ = env.reset() # This now produces an RGB tensor only

# take several actions, store the observations, actions, returns and timesteps
all_obs = []
all_actions = []
all_returns = []
all_timesteps = []


for i in range(10):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    all_obs.append(obs)
    all_actions.append(action)
    all_returns.append(reward)
    all_timesteps.append(i)

# convert to tensors.unsqueeze(0)
all_obs = t.tensor(all_obs).to(t.float32).unsqueeze(0)
all_actions = t.tensor(all_actions).reshape(-1, 1).unsqueeze(0)
all_returns = t.randn((10, 1))
all_returns_to_go = all_returns.flip(0).cumsum(0).flip(0).reshape(-1, 1).unsqueeze(0)
all_timesteps = t.tensor(all_timesteps).reshape(-1, 1).unsqueeze(0)


# Train a decision transformer on minigrid

To do this we will need:
- example trajectories (offline learning)
- a training loop

To make the sample trajectories we need:
- an agent which can traverse those maps (better if it's something good like PPO) (Agent class)
- a way to sample trajectories from that agent (preferably parallelizable)

In [None]:
import numpy as np 
from typing import Union
ActType = Union[int, np.ndarray]

class Agent:
    '''Base class for agents in a multi-armed bandit environment (you do not need to add any implementation here)'''

    rng: np.random.Generator

    def __init__(self, num_arms: int, seed: int):
        self.num_arms = num_arms
        self.reset(seed)

    def get_action(self) -> ActType:
        raise NotImplementedError()

    def observe(self, action: ActType, reward: float, info: dict) -> None:
        pass

    def reset(self, seed: int) -> None:
        self.rng = np.random.default_rng(seed)

class RandomAgent(Agent):
    def __init__(self, env):
        self.env = env
    def get_action(self):
        return self.env.action_space.sample()

def run_episode(env: gym.Env, agent: Agent, seed: int):
    rewards = []
    actions = []
    states = []
    env.reset(seed=seed)
    agent.reset(seed=seed)
    done = False
    while not done:
        arm = agent.get_action()
        actions.append(arm)
        (obs, reward, done, truncated, info) = env.step(arm)
        agent.observe(arm, reward, info)
        states.append(obs)
        rewards.append(reward)
    rewards = np.array(rewards, dtype=float)
    actions = np.array(actions, dtype=int)

    return rewards, np.array(states), actions

env = gym.make('MiniGrid-Empty-5x5-v0')
env = RGBImgPartialObsWrapper(env) # Get pixel observations
env = ImgObsWrapper(env) # Get rid of the 'mission' field
agent = RandomAgent(env)

reward_trajs = []
states_trajs = []
actions_trajs = []
for event in range(1000):
    reward_traj, states_traj, actions_traj = run_episode(env, agent, seed=i)
    reward_trajs.append(reward_traj)
    states_trajs.append(states_traj)
    actions_trajs.append(actions_traj)

# gym.vector.SyncVectorEnv(
#     env_fns=[lambda: gym.make('MiniGrid-Empty-5x5-v0') for _ in range(10)],
# )

reward_trajs = np.array(reward_trajs)
states_trajs = np.array(states_trajs)
actions_trajs = np.array(actions_trajs)



Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.



In [None]:
print(reward_trajs.shape)
print(states_trajs.shape)
print(actions_trajs.shape)

(1000,)
(1000,)
(1000,)


In [None]:
lengths = np.array([len(reward_traj) for reward_traj in reward_trajs])
print(lengths.shape)

(1000,)


In [None]:
import plotly.express as px 
px.histogram(lengths)

In [None]:
# let's try an example with a single trajectory
reward_traj = reward_trajs[0]
states_traj = states_trajs[0]
actions_traj = actions_trajs[0]

reward_traj

rtg = np.flip(reward_traj).cumsum(0)



decision_transformer = DecisionTransformer(env, max_game_length= 10**4)

logits, _ = decision_transformer(
    states = t.tensor(states_traj).to(t.float32).unsqueeze(0),
    actions = t.tensor(actions_traj).unsqueeze(0).unsqueeze(-1),
    rtgs = t.tensor(rtg).unsqueeze(0).unsqueeze(-1),
    timesteps = t.tensor(np.arange(len(reward_traj))).unsqueeze(0).unsqueeze(-1)
)

ValueError: Trajectory length is greater than the maximum sequence length for this model

In [None]:
t.tensor(reward_traj).unsqueeze(-1).unsqueeze(0).shape

torch.Size([1, 180, 1])

In [None]:
t.tensor(np.arange(len(reward_traj))).unsqueeze(0).unsqueeze(-1).shape

torch.Size([1, 180, 1])

In [None]:
gym.__version__

'0.27.0'

# See if I can load a replay buffer from D4RL


In [5]:
import d4rl
import gym 
import minigrid
from minigrid.wrappers import RGBImgPartialObsWrapper, ImgObsWrapper
from warnings import simplefilter
simplefilter(action='ignore', category=DeprecationWarning)
env = gym.make('maze2d-eval-medium-v1')
# _ = env.reset() # This now produces an RGB tensor only
env.get_dataset()

No module named 'mjrl'
No module named 'flow'
No module named 'carla'
pybullet build time: Dec 21 2022 09:14:19
load datafile: 100%|██████████| 8/8 [00:00<00:00,  9.24it/s]


{'actions': array([[ 0.12521252, -0.6819198 ],
        [-1.        , -1.        ],
        [-0.24480188, -0.59214544],
        ...,
        [-0.86986226, -1.        ],
        [-1.        , -0.36177555],
        [-1.        , -0.15718542]], dtype=float32),
 'infos/goal': array([[5.995181 , 0.912286 ],
        [5.995181 , 0.912286 ],
        [5.995181 , 0.912286 ],
        ...,
        [0.9864979, 6.004362 ],
        [0.9864979, 6.004362 ],
        [0.9864979, 6.004362 ]], dtype=float32),
 'infos/qpos': array([[4.944978 , 4.084466 ],
        [4.945365 , 4.083295 ],
        [4.943369 , 4.0797443],
        ...,
        [3.6793704, 2.89519  ],
        [3.6387599, 2.9047458],
        [3.5958643, 2.9134173]], dtype=float32),
 'infos/qvel': array([[ 0.0088652 ,  0.04536624],
        [ 0.03866518, -0.11715045],
        [-0.19959074, -0.35503528],
        ...,
        [-3.8630888 ,  1.19659   ],
        [-4.061058  ,  0.95557624],
        [-4.28955   ,  0.86713856]], dtype=float32),
 'observati

After a huge amount of work this seems not good. I will need to train my own agent.

# Training on PPO


### let's try torch rl 

```bash
python3 -m scripts.train --algo ppo --env MiniGrid-DoorKey-5x5-v0 --model DoorKey --save-interval 10 --frames 80000
python3 -m scripts.visualize --env MiniGrid-DoorKey-5x5-v0 --model DoorKey
python3 -m scripts.evaluate --env MiniGrid-DoorKey-5x5-v0 --model DoorKey
```

Unfortunately, it doesn't appear super simple to use these models because we need to actually load all their classes and stuff. Can we make this work?

In [None]:
import gymnasium as gym
import minigrid
env = gym.make('MiniGrid-DoorKey-5x5-v0') # are there any default wrappers?

In [None]:
# let's write a script which will generate a dataset from our trained ppo agent
# load the agent 

List of things to fix:
- gym env is gymanium, import minigrid
- import Discrete for type check from gymnasium

In [1]:
from src.ppo.train import train_ppo
from src.ppo.utils import PPOArgs
from src.utils import TrajectoryWriter
import warnings 
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category= DeprecationWarning)

    args = PPOArgs(
        exp_name = 'CartPole-v1',
        env_id = 'CartPole-v1',
        num_envs = 10,
        track = False,
        wandb_project_name="PPO-MiniGrid-test with cartpole",
        capture_video=True,
        cuda = False,
        total_timesteps=100000,
        max_steps=None)

    trajectory_writer = TrajectoryWriter(args.trajectory_path, args)

    ppo = train_ppo(args, trajectory_writer)

  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = getattr(cls, name)
  value = geta

Output(layout=Layout(padding='15px'))

100%|██████████| 78/78 [00:08<00:00,  8.83it/s]

Trajectory written to trajectories/CartPole-v1.pkl





{'args': {'exp_name': 'CartPole-v1', 'seed': 1, 'cuda': False, 'track': False, 'wandb_project_name': 'PPO-MiniGrid-test with cartpole', 'wandb_entity': None, 'capture_video': True, 'env_id': 'CartPole-v1', 'total_timesteps': 100000, 'learning_rate': 0.00025, 'num_envs': 10, 'num_steps': 128, 'gamma': 0.99, 'gae_lambda': 0.95, 'num_minibatches': 4, 'update_epochs': 4, 'clip_coef': 0.2, 'ent_coef': 0.01, 'vf_coef': 0.5, 'max_grad_norm': 0.5, 'max_steps': None, 'trajectory_path': 'trajectories/CartPole-v1.pkl'}, 'time': 1671699775.433394}


In [21]:
from src.ppo.train import train_ppo
from src.ppo.utils import PPOArgs
from src.utils import TrajectoryWriter
import warnings 

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category= DeprecationWarning)

    args = PPOArgs(
        exp_name = 'MiniGrid-DistShift1-v0',
        env_id = 'MiniGrid-DistShift1-v0',
        num_envs = 4,
        num_steps=128,
        track = True,
        wandb_project_name="PPO-MiniGrid",
        capture_video=True,
        cuda = False,
        total_timesteps=80*10000,
        max_steps=200)

    trajectory_writer = TrajectoryWriter(args.trajectory_path, args)

    ppo = train_ppo(args, trajectory_writer=trajectory_writer)

Output(layout=Layout(padding='15px'))

100%|██████████| 1562/1562 [05:11<00:00,  5.01it/s]


Trajectory written to trajectories/MiniGrid-DistShift1-v0.pkl


VBox(children=(Label(value='1.391 MB of 1.600 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.869298…

0,1
approx_kl,▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▂▁▂▁▁▂▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁
avg_value,▁▁▂▃▄▅▆▇▇▇▇▇▇███████▇████████▆█▇███████▇
clipfrac,▁▂▁▁▂▂▁▂▁▆▁▂▄▁▂▂▃▂▃▁▃▁▁▂▁▁▂▁▁▂▁█▂▁▂▁▁▁▁▁
clipped_surrogate_objective,▃▄▃▃▄▄▄▄▄▁▄▄▅▃▅▄▄▄▂▃▆▄▄▃▂▃▆▃▅▄▃█▄▄▅▃▄▃▃▃
entropy,██▇▇▅▆▅▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁
episode_length,█▇█▅▃▄▃▃▃▃▃▃▃▂▃▃▃▂▂▂▂▂▂▂▂▂▁▃▂▂▂▂▂▂▂▂▂▂▂▂
episode_return,▇█▇███████████████████████▁█████████████
learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
value_loss,▁▁▂█▅▅▆▂▂▅▆▂▁▁▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▁▂▁▁▁▁▁▁▁▁

0,1
approx_kl,0.0
avg_value,0.92495
clipfrac,0.0
clipped_surrogate_objective,0.0
entropy,0.00011
episode_length,13.0
episode_return,0.9766
learning_rate,0.0
value_loss,2e-05


# Turning the stored trajectories into a dataset

In [92]:
from src.utils import TrajectoryReader
import numpy as np

import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category= FutureWarning)

    traj_reader = TrajectoryReader("/Users/josephbloom/GithubRepositories/DecisionTransformerInterpretability/trajectories/MiniGrid-DistShift1-v0.pkl")
    data= traj_reader.read()

    print(data['metadata'])

    observations = data['data'].get('observations')
    actions = data['data'].get('actions')
    rewards = data['data'].get('rewards')
    dones = data['data'].get('dones')
    infos = data['data'].get('infos')

    observations = np.array(observations)
    actions = np.array(actions)
    rewards = np.array(rewards)
    dones = np.array(dones)
    infos = np.array(infos, dtype=np.ndarray)



print(observations.shape)
print(observations[0].dtype)
print(actions.shape)
print(rewards.shape)
print(dones.shape)
print(infos.shape)


# let's flatten our observations,actions, dones and infos
import torch as t 
from einops import rearrange
t_observations = rearrange(t.tensor(observations), "t b h w c -> (b t) h w c")
t_actions = rearrange(t.tensor(actions), "t b -> (b t)")
t_rewards = rearrange(t.tensor(rewards), "t b -> (b t)")
t_dones = rearrange(t.tensor(dones), "t b -> (b t)")

print(t_observations.shape)
print(t_actions.shape)
print(t_rewards.shape)
print(t_dones.shape)


{'args': {'exp_name': 'MiniGrid-DistShift1-v0', 'seed': 1, 'cuda': False, 'track': True, 'wandb_project_name': 'PPO-MiniGrid', 'wandb_entity': None, 'capture_video': True, 'env_id': 'MiniGrid-DistShift1-v0', 'total_timesteps': 800000, 'learning_rate': 0.00025, 'num_envs': 4, 'num_steps': 128, 'gamma': 0.99, 'gae_lambda': 0.95, 'num_minibatches': 4, 'update_epochs': 4, 'clip_coef': 0.2, 'ent_coef': 0.01, 'vf_coef': 0.5, 'max_grad_norm': 0.5, 'max_steps': 500, 'trajectory_path': 'trajectories/MiniGrid-DistShift1-v0.pkl'}, 'time': 1671723079.665403}
(199936, 4, 9, 7, 3)
float64
(199936, 4)
(199936, 4)
(199936, 4)
(199936,)
torch.Size([799744, 9, 7, 3])
torch.Size([799744])
torch.Size([799744])
torch.Size([799744])


# Trying to refactor the original data generation code

decision_transformer_original/gym/experiment.py

In [93]:
import plotly.express as px
px.histogram(np.diff(np.where(data['data']['dones'])[0]))


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [114]:
import torch
done_indices = torch.where(t_dones)[0]

actions = torch.tensor_split(t_actions, done_indices)
rewards = torch.tensor_split(t_rewards, done_indices+1)
returns = [r.sum() for r in rewards]
states = torch.tensor_split(t_observations, done_indices)
timesteps = [t.arange(len(i)) for i in states]
traj_lens = [len(i) for i in states]

num_timesteps = sum(traj_lens)
# used for input normalization
all_states = np.concatenate(states, axis=0)
state_mean, state_std = np.mean(all_states, axis=0), np.std(all_states, axis=0) + 1e-6

num_trajectories = len(states)

tensor([    37,     74,    386,  ..., 799709, 799722, 799735])

In [107]:
# px.histogram(returns)

In [89]:
pct_traj = 1
num_timesteps = max(int(pct_traj*num_timesteps), 1)
sorted_inds = np.argsort(returns)  # lowest to highest
num_trajectories = 1
timesteps = traj_lens[sorted_inds[-1]]
ind = num_trajectories - 2
while ind >= 0 and timesteps + traj_lens[sorted_inds[ind]] <= num_timesteps:
    timesteps += traj_lens[sorted_inds[ind]]
    num_trajectories += 1
    ind -= 1
sorted_inds = sorted_inds[-num_trajectories:]


TypeError: only integer scalar arrays can be converted to a scalar index

In [90]:
p_sample = traj_lens[sorted_inds] / sum(traj_lens[sorted_inds])

TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
# used to reweight sampling so we sample according to timesteps instead of trajectories
p_sample = traj_lens[sorted_inds] / sum(traj_lens[sorted_inds])

In [None]:
import torch  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import numpy as np 
K = 100
state_dim = states[0].shape()

def get_batch(batch_size=256, max_len=K):
    batch_inds = np.random.choice(
        np.arange(num_trajectories),
        size=batch_size,
        replace=True,
        p=p_sample,  # reweights so we sample according to timesteps
    )

    s, a, r, d, rtg, timesteps, mask = [], [], [], [], [], [], []
    for i in range(batch_size):
        traj = trajectories[int(sorted_inds[batch_inds[i]])]
        si = random.randint(0, traj['rewards'].shape[0] - 1)

        # get sequences from dataset
        s.append(traj['observations'][si:si + max_len].reshape(1, -1, state_dim))
        a.append(traj['actions'][si:si + max_len].reshape(1, -1, act_dim))
        r.append(traj['rewards'][si:si + max_len].reshape(1, -1, 1))
        if 'terminals' in traj:
            d.append(traj['terminals'][si:si + max_len].reshape(1, -1))
        else:
            d.append(traj['dones'][si:si + max_len].reshape(1, -1))
        timesteps.append(np.arange(si, si + s[-1].shape[1]).reshape(1, -1))
        timesteps[-1][timesteps[-1] >= max_ep_len] = max_ep_len-1  # padding cutoff
        rtg.append(discount_cumsum(traj['rewards'][si:], gamma=1.)[:s[-1].shape[1] + 1].reshape(1, -1, 1))
        if rtg[-1].shape[1] <= s[-1].shape[1]:
            rtg[-1] = np.concatenate([rtg[-1], np.zeros((1, 1, 1))], axis=1)

        # padding and state + reward normalization
        tlen = s[-1].shape[1]
        s[-1] = np.concatenate([np.zeros((1, max_len - tlen, state_dim)), s[-1]], axis=1)
        s[-1] = (s[-1] - state_mean) / state_std
        a[-1] = np.concatenate([np.ones((1, max_len - tlen, act_dim)) * -10., a[-1]], axis=1)
        r[-1] = np.concatenate([np.zeros((1, max_len - tlen, 1)), r[-1]], axis=1)
        d[-1] = np.concatenate([np.ones((1, max_len - tlen)) * 2, d[-1]], axis=1)
        rtg[-1] = np.concatenate([np.zeros((1, max_len - tlen, 1)), rtg[-1]], axis=1) / scale
        timesteps[-1] = np.concatenate([np.zeros((1, max_len - tlen)), timesteps[-1]], axis=1)
        mask.append(np.concatenate([np.zeros((1, max_len - tlen)), np.ones((1, tlen))], axis=1))

    s = torch.from_numpy(np.concatenate(s, axis=0)).to(dtype=torch.float32, device=device)
    a = torch.from_numpy(np.concatenate(a, axis=0)).to(dtype=torch.float32, device=device)
    r = torch.from_numpy(np.concatenate(r, axis=0)).to(dtype=torch.float32, device=device)
    d = torch.from_numpy(np.concatenate(d, axis=0)).to(dtype=torch.long, device=device)
    rtg = torch.from_numpy(np.concatenate(rtg, axis=0)).to(dtype=torch.float32, device=device)
    timesteps = torch.from_numpy(np.concatenate(timesteps, axis=0)).to(dtype=torch.long, device=device)
    mask = torch.from_numpy(np.concatenate(mask, axis=0)).to(device=device)

    return s, a, r, d, rtg, timesteps, mask

In [122]:
import gymnasium as gym
import plotly.express as px
from src.visualization import render_minigrid_observations, render_minigrid_observation

from minigrid.core.constants import IDX_TO_OBJECT
import numpy as np
import torch

def find_agent(observation):
    height = observation.shape[0]
    width = observation.shape[1]
    for i in range(width):
        for j in range(height):
            object = IDX_TO_OBJECT[int(observation[j,i][0])]
            if object == 'agent':
                return j, i


def render_minigrid_observation(env, observation):
    if isinstance(observation, np.ndarray):
        observation = observation.copy() # so we don't edit the original object
    elif isinstance(observation, torch.Tensor):
        observation = observation.numpy().copy()

    agent_pos = find_agent(observation)
    agent_dir = observation[agent_pos[0], agent_pos[1]][2]

    observation[agent_pos[0], agent_pos[1]] = [0,0,0]

    grid, _ = env.grid.decode(observation.astype(np.uint8))
    
    i = agent_pos[0]
    j = agent_pos[1]
    
    return grid.render(32, (i,j), agent_dir=agent_dir)

def render_minigrid_observations(env, observations):
    return np.array([render_minigrid_observation(env, observation) for observation in observations])


env = gym.make(data['metadata']['args']['env_id'], render_mode = 'rgb_array')
print(data['metadata']['args']['env_id'])
_, _ = env.reset()

from minigrid.core.actions import Actions

print([Actions(int(i)) for i in actions[1000]])
print(rewards[1000])
print(returns[1000])
imgs = render_minigrid_observations(env, states[1000])
fig = px.imshow(imgs, animation_frame=0)
fig.show()

MiniGrid-DistShift1-v0
[<Actions.forward: 2>, <Actions.left: 0>, <Actions.forward: 2>, <Actions.forward: 2>, <Actions.left: 0>, <Actions.left: 0>, <Actions.forward: 2>, <Actions.forward: 2>, <Actions.left: 0>, <Actions.forward: 2>, <Actions.forward: 2>, <Actions.forward: 2>, <Actions.forward: 2>, <Actions.forward: 2>, <Actions.forward: 2>, <Actions.left: 0>, <Actions.forward: 2>]
tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.9694],
       dtype=torch.float64)
tensor(0.9694, dtype=torch.float64)



distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [123]:
import plotly.graph_objects as go
import numpy as np
from plotly.subplots import make_subplots

fig = make_subplots(
    rows=1, cols=2, subplot_titles=('Title1', 'Title2'),
    horizontal_spacing=0.051
)

fig.add_trace(go.Bar(x=['A', 'B', 'C', 'D'], y=[4, 2, 1, 5]), row=1, col=1) #this is the trace of index 0
fig.add_trace(go.Scatter(x=['A', 'B', 'C', 'D'], y=[2, 1.45, 0.25, 2.1],
                        line_width=3), row=1, col=1)   # trace of index 1

fig.add_trace(go.Scatter(x=np.arange(10),
                         y=1+3*np.random.rand(10),
                        marker_size=6), row=1, col=2)  #trace of index 2


#traces=[0, 1, 2]` in the frame definition makes the difference: it tells that 
#the traces of index 0, 1 from the subplot(1,1), are unchanged, and we only ensure their visibility in each #frame (because neither x nor y are modified)
#while the trace 2 from the subplot(1,2) is animated, because the y-values are changed. 

frames =[go.Frame(data=[go.Bar(visible=True),
                        go.Scatter(visible=True),
                        go.Scatter(y=2+3*np.random.rand(10))],
                  traces=[0,1,2]) for k in range(20)]   # define 20 frames

fig.frames=frames
button = dict(
             label='Play',
             method='animate',
             args=[None, dict(frame=dict(duration=50, redraw=False), 
                              transition=dict(duration=0),
                              fromcurrent=True,
                              mode='immediate')])
fig.update_layout(updatemenus=[dict(type='buttons',
                              showactive=False,
                              y=0,
                              x=1.05,
                              xanchor='left',
                              yanchor='bottom',
                              buttons=[button] )
                                      ],
                 width=800, height=500)
                              
fig.update_layout(yaxis2_range=[0, 5.5], yaxis2_autorange=False)


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.

