In [47]:
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from itertools import chain
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
import torch.optim as optim
import pandas as pd
from torch.utils.tensorboard import SummaryWriter
import time

In [48]:
# set seed of all polssible random number generators
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [49]:
def read_data_frame():
    df = pd.read_csv("all_data.csv")
    episodes = df.groupby("Episode")
    episode_data = {}
    for episode, data in episodes:
        episode_data[episode] = [
            [row["X"], row["Y"], row["Action"], row["Reward"]]
            for index, row in data.iterrows()
        ]
    return episode_data


episode_data = [episode for episode in read_data_frame().values()]

In [50]:
# shuffle the data and split it into training and testing
np.random.shuffle(episode_data)
train_data = episode_data[: int(len(episode_data) * 0.8)]
test_data = episode_data[int(len(episode_data) * 0.8) :]

In [51]:
class CFG:
    block_size = 512
    start_token = 0
    padding_token = 1
    end_token = 3

    batch_size = 64

In [52]:
class Tokenizer:
    mapper = dict(
        {
            CFG.start_token: CFG.start_token,
            CFG.padding_token: CFG.padding_token,
            CFG.end_token: CFG.end_token,
        }
    )
    token_counter = 3

    def __init__(self, cfg: CFG):
        self.cfg = cfg

    # gets one unicode number. It should check if the unicode number is already in the mapper. If not, it should add it. Returns the number.
    def get_tokenized_unicode(self, x):
        if x not in self.mapper:
            self.mapper[x] = self.token_counter
            self.token_counter += 1
        return self.mapper[x]

    def encode(self, x: np.array):
        x = np.array(x)
        tokens = []
        for i in range(0, len(x)):
            step = np.array(x[i])
            step = ",".join(step.astype(str))
            for i in range(len(step)):
                unicode_numb = ord(step[i])
                tokens.append(self.get_tokenized_unicode(unicode_numb))
        return [self.mapper[CFG.start_token]] + tokens + [self.mapper[CFG.end_token]]

    def cut_to_max_len(self, x):
        tokenized_steps = []
        length = 0
        for i in range(0, len(x)):
            step = x[i]
            tokenized_step = self.encode([step])
            if length + len(tokenized_step) > CFG.block_size - 2:
                return x[:i]
            tokenized_steps.append(tokenized_step)
            length += len(tokenized_step)

        return x

    def pad(self, x):
        return x + [CFG.padding_token] * (CFG.block_size - len(x))

    def __call__(self, x):
        return self.pad(self.encode(x))

In [53]:
tokenizer = Tokenizer(CFG)

In [54]:
class A3CDataset(torch.utils.data.Dataset):
    def __init__(self, global_history, tokenizer, use_crop=True):
        self.global_history = global_history
        self.tokenizer = tokenizer
        self.use_crop = use_crop

    def __len__(self):
        return len(self.global_history)

    def crop(self, arr):
        if len(arr) > CFG.block_size:
            arr = arr[: CFG.block_size]

        take_first = np.random.randint(2, len(arr))
        target = arr[-1]
        arr = arr[: take_first - 1]
        return arr, target

    def __getitem__(self, idx):
        learning_history = self.global_history[idx]

        if self.use_crop:
            learning_history = self.tokenizer.cut_to_max_len(learning_history)
            learning_history, target = self.crop(learning_history)

        tokenized = self.tokenizer(learning_history)
        tensor = torch.tensor(np.array(tokenized))
        action = torch.tensor(target[2])
        return tensor, action

In [55]:
len(train_data)

4830

In [56]:
len(A3CDataset(train_data, tokenizer=tokenizer))

4830

In [57]:
train_data_loader = DataLoader(
    A3CDataset(train_data, tokenizer=tokenizer),
    batch_size=CFG.batch_size,
    shuffle=False,
)
test_data_loader = DataLoader(
    A3CDataset(test_data, tokenizer=tokenizer), batch_size=CFG.batch_size, shuffle=False
)

In [58]:
print(f"sample: {next(iter(train_data_loader))}")

sample: [tensor([[ 0,  3,  4,  ...,  1,  1,  1],
        [ 0, 11,  4,  ...,  1,  1,  1],
        [ 0,  3,  4,  ...,  1,  1,  1],
        ...,
        [ 0,  3,  4,  ...,  1,  1,  1],
        [ 0,  3,  4,  ...,  1,  1,  1],
        [ 0, 11,  4,  ...,  1,  1,  1]]), tensor([2, 4, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 0,
        2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2,
        0, 2, 0, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0])]


In [59]:
tokenizer.mapper

{0: 0,
 1: 1,
 3: 3,
 52: 3,
 46: 4,
 48: 5,
 44: 6,
 51: 7,
 45: 8,
 49: 9,
 50: 10,
 53: 11,
 54: 12}

### Find vocabulary size base on all data

In [60]:
# find vocab size by iterating over the dataset
all_data_loader = DataLoader(
    A3CDataset(episode_data, tokenizer=tokenizer, use_crop=False),
    batch_size=CFG.batch_size,
    shuffle=False,
)

for x, y in train_data_loader:
    # noop
    a = 2

vocab_size = len(tokenizer.mapper)
print(f"Vocab size: {vocab_size}")

Vocab size: 15


In [61]:
512 // 4

128

### NANO

In [62]:
# hyperparameters
batch_size = 64  # how many independent sequences will we process in parallel?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_iters = 200
n_embd = 384
n_head = 8
n_layer = 8
dropout = 0.2


actions_demention = 5


class Head(nn.Module):
    """one head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer(
            "tril", torch.tril(torch.ones(CFG.block_size, CFG.block_size))
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B, T, C = x.shape
        k = self.key(x)  # (B,T,hs)
        q = self.query(x)  # (B,T,hs)
        # compute attention scores ("affinities")
        wei = (
            q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        )  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,hs)
        out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out


class MultiHeadAttention(nn.Module):
    """multiple heads of self-attention in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedFoward(nn.Module):
    """a simple linear layer followed by a non-linearity"""

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """Transformer block: communication followed by computation"""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(CFG.block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, actions_demention)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        try:
            tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        except:
            print(idx)
            print(idx.shape)
            print(targets)
            print(targets.shape)
            print(idx.shape)
        # tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B, C * T)
            # targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss


model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters()) / 1e6, "M parameters")

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

14.391557 M parameters


### Solution with Claude help

In [63]:
class CFG:
    block_size = 512
    start_token = 0
    padding_token = 1
    end_token = 2
    batch_size = 64
    state_min, state_max = 0, 9  # Adjust based on your environment
    state_bins = 10
    reward_min, reward_max = -1, 1  # Adjust based on your environment
    reward_bins = 20
    action_dim = 5
    n_embd = 384
    n_head = 8
    n_layer = 8
    dropout = 0.2
    vocab_size = 0


class ADTokenizer:
    def __init__(self, cfg):
        self.cfg = cfg
        self.state_bins = np.linspace(cfg.state_min, cfg.state_max, cfg.state_bins)
        self.reward_bins = np.linspace(cfg.reward_min, cfg.reward_max, cfg.reward_bins)
        self.vocab_size = (
            cfg.state_bins * 2 + cfg.action_dim + cfg.reward_bins + 3
        )  # +3 for start, end, and pad tokens

    def discretize(self, value, bins):
        return np.digitize(value, bins)

    def encode(self, x):
        tokens = []
        for state_x, state_y, action, reward in x:
            tokens.extend(
                [
                    self.discretize(state_x, self.state_bins) + 3,
                    self.discretize(state_y, self.state_bins) + self.cfg.state_bins + 3,
                    action + self.cfg.state_bins * 2 + 3,
                    self.discretize(reward, self.reward_bins)
                    + self.cfg.state_bins * 2
                    + self.cfg.action_dim
                    + 3,
                ]
            )
        return [self.cfg.start_token] + tokens + [self.cfg.end_token]

    def pad(self, x):
        return x + [self.cfg.padding_token] * (self.cfg.block_size - len(x))

    def __call__(self, x):
        return self.pad(self.encode(x))


class A3CDataset(Dataset):
    def __init__(self, global_history, tokenizer, use_crop=True):
        self.global_history = global_history
        self.tokenizer = tokenizer
        self.use_crop = use_crop

    def __len__(self):
        return len(self.global_history)

    def crop(self, arr):
        if len(arr) > self.tokenizer.cfg.block_size // 4:
            arr = arr[: self.tokenizer.cfg.block_size // 4]
        take_first = np.random.randint(2, len(arr))
        target = arr[-1]
        arr = arr[: take_first - 1]
        return arr, target

    def __getitem__(self, idx):
        learning_history = self.global_history[idx]
        if self.use_crop:
            learning_history, target = self.crop(learning_history)
        learning_history = learning_history[: len(learning_history) - 1]
        # target = learning_history[-1]
        tokenized = self.tokenizer(learning_history)
        tensor = torch.tensor(tokenized)
        action = torch.tensor(target[2])
        return tensor, action


class ADTransformer(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.token_embedding = nn.Embedding(cfg.vocab_size, cfg.n_embd)
        self.position_embedding = nn.Embedding(cfg.block_size, cfg.n_embd)
        self.blocks = nn.Sequential(
            *[Block(cfg.n_embd, cfg.n_head) for _ in range(cfg.n_layer)]
        )
        self.ln_f = nn.LayerNorm(cfg.n_embd)
        self.action_head = nn.Linear(cfg.n_embd, cfg.action_dim)

    def forward(self, idx):
        b, t = idx.size()
        tok_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(torch.arange(t, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        action_logits = self.action_head(
            x[:, -1, :]
        )  # Only predict for the last position
        return action_logits

    def predict_action(self, context):
        action_logits = self(context)
        return torch.argmax(action_logits, dim=-1)

In [64]:
tokenizer = ADTokenizer(CFG)

train_data_loader = DataLoader(
    A3CDataset(train_data, tokenizer=tokenizer),
    batch_size=CFG.batch_size,
    shuffle=True,
)
test_data_loader = DataLoader(
    A3CDataset(test_data, tokenizer=tokenizer), batch_size=CFG.batch_size, shuffle=False
)

CFG.vocab_size = tokenizer.vocab_size

model = ADTransformer(CFG).to(device)

In [24]:
EPOCHS = 20
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=EPOCHS * len(train_data_loader)
)

writer = SummaryWriter()

for epoch in range(EPOCHS):
    model.train()
    for j, (X, y) in tqdm(
        enumerate(train_data_loader),
        total=len(train_data_loader),
        desc=f"Epoch {epoch}",
    ):
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model(X)
        loss = F.cross_entropy(logits, y)
        writer.add_scalar("Loss/train", loss.item(), epoch * len(train_data_loader) + j)
        writer.add_scalar(
            "Learning rate",
            scheduler.get_last_lr()[0],
            epoch * len(train_data_loader) + j,
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X, y in test_data_loader:
            X, y = X.to(device), y.to(device)
            logits = model(X)
            val_loss += F.cross_entropy(logits, y).item()
    val_loss /= len(test_data_loader)
    writer.add_scalar("Loss/val", val_loss, epoch)

    print(f"Epoch {epoch}: Train Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}")

writer.close()

Epoch 0: 100%|██████████| 76/76 [00:25<00:00,  2.94it/s]


Epoch 0: Train Loss: 0.4116, Val Loss: 0.4773


Epoch 1: 100%|██████████| 76/76 [00:25<00:00,  3.01it/s]


Epoch 1: Train Loss: 0.6964, Val Loss: 0.4726


Epoch 2: 100%|██████████| 76/76 [00:25<00:00,  2.94it/s]


Epoch 2: Train Loss: 0.5508, Val Loss: 0.4534


Epoch 3:  57%|█████▋    | 43/76 [00:14<00:11,  2.92it/s]


KeyboardInterrupt: 

In [25]:
# save the model
# torch.save(model.state_dict(), "model.pth")
model.load_state_dict(torch.load("model.pth"))

  model.load_state_dict(torch.load("model.pth"))


<All keys matched successfully>

In [37]:
from dark_room import DarkRoom
from time import sleep

from utils import print_grid

model = model.eval().cpu()
env = DarkRoom(size=9)
state = env.reset()


action_history = [[4.0, 4.0, 0, 0]]
global_reward = 0


def make_iteration(state, global_reward):
    tokenized = torch.tensor([tokenizer(action_history)])
    logits = model(tokenized)
    action = torch.argmax(logits, dim=1).item()
    #     action = torch.softmax(policy[0], dim=-1).argmax().item()
    # action = Categorical(policy).sample().item()
    state, reward, done = env.step(action)

    action_history.append([state[0], state[1], action, reward])

    return state, global_reward + reward, done

In [38]:
action_history

[[4.0, 4.0, 0, 0]]

In [46]:
state, global_reward, done = make_iteration(state, global_reward)
print_grid(env.render())

Goal reached
[H[J
[['G' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']]


In [30]:
# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X, y in test_data_loader:
        X, y = X.to(device), y.to(device)
        logits = model(X)
        predicted = torch.argmax(logits, dim=1)
        total += y.size(0)
        correct += (predicted == y).sum().item()

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8344


In [None]:
### Training

In [20]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
EPOCHS = 4000
linear_schedule = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lambda i: min(1.0, i / (EPOCHS * len(train_data_loader)))
)
# tensorboard pytorch logging


writer = SummaryWriter()

# training loop
for i in range(EPOCHS):
    model.train()
    for j, (X, y) in tqdm(
        enumerate(train_data_loader),
        unit="batch",
        total=len(train_data_loader),
        desc=f"Epoch {i}",
    ):
        X = X.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        logits, loss = model(X, y)
        writer.add_scalar("Loss/train", loss, time.time())
        writer.add_scalar("Learning rate", optimizer.param_groups[0]["lr"], time.time())
        writer.add_scalar("Epoch", i, time.time())
        loss.backward()
        linear_schedule.step()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        for i, (X, y) in tqdm(
            enumerate(test_data_loader),
            unit="batch",
            total=len(test_data_loader),
            desc=f"Epoch {i}",
        ):
            X = X.to(device)
            y = y.to(device)
            logits, loss = model(X, y)
            writer.add_scalar("Loss/val", loss, time.time())

            writer.add_scalar("Epoch", i, time.time())

writer.close()

Epoch 0: 100%|██████████| 76/76 [00:26<00:00,  2.82batch/s]
Epoch 0: 100%|██████████| 19/19 [00:02<00:00,  8.69batch/s]
Epoch 1: 100%|██████████| 76/76 [00:28<00:00,  2.71batch/s]
Epoch 1: 100%|██████████| 19/19 [00:02<00:00,  8.14batch/s]
Epoch 2: 100%|██████████| 76/76 [00:27<00:00,  2.72batch/s]
Epoch 2: 100%|██████████| 19/19 [00:02<00:00,  7.89batch/s]
Epoch 3: 100%|██████████| 76/76 [00:28<00:00,  2.66batch/s]
Epoch 3: 100%|██████████| 19/19 [00:02<00:00,  7.57batch/s]
Epoch 4: 100%|██████████| 76/76 [00:28<00:00,  2.63batch/s]
Epoch 4: 100%|██████████| 19/19 [00:02<00:00,  8.55batch/s]
Epoch 5: 100%|██████████| 76/76 [00:28<00:00,  2.71batch/s]
Epoch 5: 100%|██████████| 19/19 [00:02<00:00,  8.50batch/s]
Epoch 6: 100%|██████████| 76/76 [00:27<00:00,  2.78batch/s]
Epoch 6: 100%|██████████| 19/19 [00:02<00:00,  8.47batch/s]
Epoch 7: 100%|██████████| 76/76 [00:27<00:00,  2.79batch/s]
Epoch 7: 100%|██████████| 19/19 [00:02<00:00,  7.77batch/s]
Epoch 8: 100%|██████████| 76/76 [00:28<0

KeyboardInterrupt: 

In [21]:
from dark_room import DarkRoom
from time import sleep

from utils import print_grid

model.eval()
env = DarkRoom(size=9)
state = env.reset()


action_history = [[4.0, 4.0, 0, 0]]
global_reward = 0


def make_iteration(state, global_reward):
    tokenized = torch.tensor([tokenizer([state])]).to(device)
    policy = model(tokenized)
    action = torch.softmax(policy[0], dim=-1).argmax().item()
    # action = Categorical(policy).sample().item()
    state, reward, done = env.step(action)

    action_history.append([state[0], state[1], action, global_reward + reward])

    return state, global_reward + reward, done


# for i in range(1000):
#     state, reward, done = make_iteration(state)
#     sleep(0.1)
#     if done:
#         print(f"Goal reached in {i} steps")
#         print(f"Action history: {action_history}")
#         print_grid(env.render())
#         break

In [54]:
action_history

[[4.0, 4.0, 0, 0],
 [3, 4, 2, -1],
 [2, 4, 2, -2],
 [1, 4, 2, -3],
 [0, 4, 2, -4],
 [0, 4, 2, -5],
 [0, 4, 2, -6],
 [0, 4, 2, -7],
 [0, 4, 2, -8],
 [0, 4, 2, -9],
 [0, 4, 2, -10],
 [0, 4, 2, -11],
 [0, 4, 2, -12],
 [0, 4, 2, -13],
 [0, 4, 2, -14],
 [0, 4, 2, -15],
 [0, 4, 2, -16],
 [0, 4, 2, -17],
 [0, 4, 2, -18],
 [0, 4, 2, -19],
 [0, 4, 2, -20],
 [0, 4, 2, -21],
 [0, 4, 2, -22],
 [0, 4, 2, -23],
 [0, 4, 2, -24],
 [0, 4, 2, -25],
 [0, 4, 2, -26],
 [0, 4, 2, -27],
 [0, 4, 2, -28],
 [0, 4, 2, -29],
 [0, 4, 2, -30],
 [0, 4, 2, -31]]

In [121]:
state, global_reward, done = make_iteration(state, global_reward)
print_grid(env.render())

[H[J
[['X' ' ' 'G' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' ']]


In [99]:
env.step(0)

([0, 2], -1, False)