# Value-based vision agent in the Tetris environment using PyTorch

|        | TYPE                   | VALUES          | DESCRIPTION                                                                                                |
|--------|------------------------|-----------------|------------------------------------------------------------------------------------------------------------|
| Action Space | ndarray<br/>(1,) | {0, 1, 2, 3, 4} | Action to manipulate the current tile.<br/>0: No action<br/>1: Rotate<br/>2: Right<br/>3: Left<br/>4: Down |
| Observation Space | ndarray<br/>(210,160) | <0, 255> | The game screen. |
| Reward |  | float | Reward given when a row is filled.<br/>Single: 1<br/>Double: 3<br/>Triple: 8<br/>Quadruple: 18       |
| Termination |  | boolean | The game ends when the pieces stack up to the top of the playing field.                                    |

In [None]:
import copy
import time
import torch
import imageio
import gymnasium as gym

from agent_image import VisionDeepQ

In [None]:
# Atari Tetris emulator: the agent observes grayscale frames, while
# `render()` yields RGB arrays that are later collected into a GIF.
environment = gym.make(
    "ALE/Tetris-v5",
    render_mode="rgb_array",
    obs_type="grayscale",
    frameskip=4,
    repeat_action_probability=0.25,
)

# Frame rate recorded in the environment's metadata.
environment.metadata["render_fps"] = 30

### Training

#### Parameters

|Parameter|Description|
|---------|-----------|
| SHAPE | input shape of the network (batch, channels, height, width) |
| DISCOUNT | discount rate for rewards |
| GAMMA | discount rate for Q-learning |
| PUNISHMENT | punishment for losing |
| INCENTIVE | incentive for rewards |
| EXPLORATION_RATE | initial exploration rate |
| EXPLORATION_MIN | minimum exploration rate |
| EXPLORATION_STEPS | number of games over which the exploration rate decays from `EXPLORATION_RATE` to `EXPLORATION_MIN` |
| MINIBATCH | size of the minibatch |
| TRAIN_EVERY | train the network every `n` games |
| START_TRAINING_AT | start training after `n` games |
| MEMORY | size of the agent's internal memory |
| RESET_Q_EVERY | update target-network every `n` games |

In [None]:
# --- Run length ---------------------------------------------------------
GAMES = 100       # number of games played by the training loop
CHECKPOINT = 10   # NOTE(review): not referenced by the visible cells

# --- Frame geometry: (batch, channels, height, width) -------------------
SHAPE = (1, 1, 210, 160)
# See method `agent.preprocess` for more information.
RESHAPE = (1, 1, 203 - 27, 64 - 22)

# --- Reward shaping -----------------------------------------------------
DISCOUNT = 0.95   # discount rate for rewards
GAMMA = 0.99      # discount rate for Q-learning

PUNISHMENT = -1   # punishment for losing
INCENTIVE = 1     # incentive for rewards

# --- Training schedule --------------------------------------------------
MINIBATCH = 16           # size of the minibatch
TRAIN_EVERY = 10         # train the network every `n` games
START_TRAINING_AT = 90   # start training after `n` games

# --- Exploration --------------------------------------------------------
EXPLORATION_RATE = 1.0    # initial exploration rate
EXPLORATION_MIN = 0.001   # minimum exploration rate
EXPLORATION_STEPS = 2000 // TRAIN_EVERY

# --- Memory and target network ------------------------------------------
MEMORY = 250                      # size of the agent's internal memory
RESET_Q_EVERY = 5 * TRAIN_EVERY   # update target-network every `n` games

# --- Model and optimiser configuration ----------------------------------
NETWORK = dict(
    input_channels=1,
    outputs=5,
    channels=[32, 32],
    kernels=[3, 5],
    padding=["same", "same"],
)
OPTIMIZER = dict(
    optimizer=torch.optim.Adam,
    lr=0.001,
    hyperparameters={},
)

#### Initialisation

In [None]:
# Agent that selects actions and is optimised during training.
value_agent = VisionDeepQ(
    network=NETWORK,
    optimizer=OPTIMIZER,
    batch_size=MINIBATCH,
    shape=RESHAPE,
    other=dict(
        discount=DISCOUNT,
        gamma=GAMMA,
        memory=MEMORY,
        incentive=INCENTIVE,
        punishment=PUNISHMENT,
        exploration_rate=EXPLORATION_RATE,
        exploration_steps=EXPLORATION_STEPS,
        exploration_min=EXPLORATION_MIN,
    ),
)

# Independent copy that serves as the target network: it is handed to
# `learn` and periodically refreshed from `value_agent` while training.
_value_agent = copy.deepcopy(value_agent)

#### Training

In [None]:
# Play GAMES episodes, periodically training `value_agent` against the
# target network `_value_agent`, and report the elapsed wall-clock time.
start = time.time()
for game in range(1, GAMES + 1):
    # New episode: convert the first frame into the agent's input format.
    state = value_agent.preprocess(environment.reset()[0], SHAPE)

    STEPS = REWARDS = 0
    TERMINATED = TRUNCATED = False
    while not (TERMINATED or TRUNCATED):
        action = value_agent.action(state).detach()

        new_state, reward, TERMINATED, TRUNCATED, _ = environment.step(action.item())
        new_state = value_agent.preprocess(new_state, SHAPE)

        value_agent.remember(state, action, torch.tensor([reward]))

        state = new_state
        REWARDS += reward
        STEPS += 1

    # Only games that scored at least once are passed to `memorize`
    # (presumably committing the game to the long-term memory -- confirm
    # in `agent_image`); the per-game buffer is emptied either way.
    if REWARDS > 0:
        value_agent.memorize(state, STEPS)
    value_agent.memory["game"].clear()

    # Train every TRAIN_EVERY games, but only once START_TRAINING_AT games
    # have been played and there is something in memory to sample from.
    # `loss` stays None for games in which no training step was taken.
    loss = None
    if (game >= START_TRAINING_AT
            and game % TRAIN_EVERY == 0
            and len(value_agent.memory["memory"]) > 0):
        loss = value_agent.learn(network=_value_agent)

    # Periodically synchronise the target network with the trained agent.
    if game % RESET_Q_EVERY == 0:
        _value_agent.load_state_dict(value_agent.state_dict())

print(f"Total training time: {round(time.time() - start, 2)} seconds")

#### Visualisation

##### In action

In [None]:
# Play one game greedily with the trained agent and save the rendered
# frames as an animated GIF.
import os

state = value_agent.preprocess(environment.reset()[0], SHAPE)

images = []
TERMINATED = TRUNCATED = False
# Pure inference: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    while not (TERMINATED or TRUNCATED):
        # Greedy policy: pick the action with the highest predicted value.
        action = value_agent(state).argmax(1).item()

        state, reward, TERMINATED, TRUNCATED, _ = environment.step(action)
        state = value_agent.preprocess(state, SHAPE)

        images.append(environment.render())

# Make sure the target directory exists before writing the animation.
os.makedirs("./output", exist_ok=True)
_ = imageio.mimsave('./output/value-image-tetris.gif', images, duration=25)

<img src="./output/value-image-tetris.gif" width="1000" height="1000" />

In [None]:
# Shut the environment down now that training and recording are finished.
environment.close()