# Value-based vision agent in the Tetris environment using PyTorch

|        | TYPE                   | VALUES          | DESCRIPTION                                                                                                |
|--------|------------------------|-----------------|------------------------------------------------------------------------------------------------------------|
| Action Space | ndarray<br/>(1,) | {0, 1, 2, 3, 4} | Action to manipulate the current tile.<br/>0: No action<br/>1: Rotate<br/>2: Right<br/>3: Left<br/>4: Down |
| Observation Space | ndarray<br/>(210,160) | <0, 255> | The game screen. |
| Reward | float |  | Reward given when a row is filled.<br/>Single: 1<br/>Double: 3<br/>Triple: 8<br/>Quadruple: 18       |
| Termination | boolean |  | The game ends when the pieces stack up to the top of the playing field.                                    |

In [None]:
import sys
import copy
import time
import torch
import imageio
from tqdm import tqdm
import gymnasium as gym

sys.path.append("../")

from agent_image import VisionDeepQ
from _helpers.plotting import visualise_csv  # noqa
from _helpers.gif import gif_stack  # noqa

In [None]:
# Create the ALE Tetris environment. Rendering returns RGB arrays, observations
# are requested as grayscale, frameskip is 1 and the action-repeat probability
# is set to 0.0.
environment = gym.make(
    "ALE/Tetris-v5",
    render_mode="rgb_array",
    obs_type="grayscale",
    frameskip=1,
    repeat_action_probability=0.0,
)
# Frame rate stored in the environment's metadata.
environment.metadata["render_fps"] = 30

### Training

#### Parameters

|Parameter|Description|
|---------|-----------|
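| SHAPE | shape of a single raw observation as passed to the agent's preprocessing (batch, channels, height, width) |
| RESHAPE | input shape of the network after preprocessing (batch, stacked frames, height, width) |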
| DISCOUNT | discount rate for rewards |
| GAMMA | discount rate for Q-learning |
| PUNISHMENT | punishment for losing |
| INCENTIVE | incentive for rewards |
| EXPLORATION_RATE | initial exploration rate |
| EXPLORATION_MIN | minimum exploration rate |
| EXPLORATION_STEPS | number of games to decay exploration rate from `RATE` to `MIN` |
| MINIBATCH | size of the minibatch |
| TRAIN_EVERY | train the network every `n` games |
| START_TRAINING_AT | start training after `n` games |
| REMEMBER | only remember games with reward, and this fraction of the games without |
| MEMORY | size of the agent's internal memory |
| RESET_Q_EVERY | update target-network every `n` games |

In [None]:
# --- Run length ---------------------------------------------------------------
GAMES = 100_000  # number of games the training loop plays
CHECKPOINT = 5_000  # NOTE(review): not referenced by the cells visible here

# --- Tensor shapes (batch, channels, height, width) ---------------------------
SHAPE = (1, 1, 210, 160)  # passed to the agent's `preprocess` and `observe` calls
RESHAPE = (1, 4, 88, 21)  # passed to the agent as `shape`; index 1 = frames stacked per state

# --- Discounting --------------------------------------------------------------
DISCOUNT = 0.95  # discount rate for rewards
GAMMA = 0.99  # discount rate for Q-learning

# --- Reward shaping -----------------------------------------------------------
PUNISHMENT = -10  # punishment for losing
INCENTIVE = 10  # incentive for rewards

# --- Training schedule --------------------------------------------------------
MINIBATCH = 16  # size of the minibatch
TRAIN_EVERY = 25  # train the network every n games
START_TRAINING_AT = 2_000  # game count after which training is meant to begin

# --- Exploration --------------------------------------------------------------
EXPLORATION_RATE = 1.0  # initial exploration rate
EXPLORATION_MIN = 0.001  # minimum exploration rate
EXPLORATION_STEPS = 30_000 // TRAIN_EVERY  # decay horizon from RATE to MIN (evaluates to 1200)

# --- Replay memory and target network -----------------------------------------
# NOTE(review): REMEMBER is documented as the fraction of reward-less games to
# keep, but it is not referenced by the cells visible here.
REMEMBER = 0.005
MEMORY = 100  # size of the agent's internal memory
RESET_Q_EVERY = TRAIN_EVERY * 150  # sync the target network every n games (evaluates to 3750)

# Network layout handed to the agent. `input_channels` equals RESHAPE[1] and
# `outputs` equals the five actions of the environment's action space.
NETWORK = {
    "input_channels": 4,
    "outputs": 5,
    "channels": [64, 32],
    "kernels": [2, 3],
    # "strides": [1, 2],
    "padding": ["valid", "same"],
    "nodes": [32],
}

# Optimizer class, learning rate and any extra keyword arguments for it.
OPTIMIZER = {
    "optimizer": torch.optim.Adam,
    "lr": 0.001,
    "hyperparameters": {},
}

#### Initialisation

In [None]:
# Build the agent that plays and learns, wiring in the hyperparameters defined
# in the parameter cell.
value_agent = VisionDeepQ(
    network=NETWORK,
    optimizer=OPTIMIZER,
    batch_size=MINIBATCH,
    shape=RESHAPE,
    memory=MEMORY,
    discount=DISCOUNT,
    gamma=GAMMA,
    punishment=PUNISHMENT,
    incentive=INCENTIVE,
    exploration_rate=EXPLORATION_RATE,
    exploration_steps=EXPLORATION_STEPS,
    exploration_min=EXPLORATION_MIN,
)

# Independent copy of the freshly built agent. The training loop passes it to
# `learn(network=...)` and periodically overwrites its weights with those of
# `value_agent`.
_value_agent = copy.deepcopy(value_agent)

#### Training

In [None]:
# Play GAMES games. After each game, optionally store it in the agent's memory,
# optionally run a learning step against the target copy `_value_agent`, and
# periodically refresh that copy from the online agent.
# perf_counter() is monotonic, so the reported duration cannot be skewed by
# system clock adjustments during a long run.
start = time.perf_counter()
for game in tqdm(range(1, GAMES + 1),
                 desc="Game", ncols=50, bar_format='%s{l_bar}{bar}|' % '\033[30m'):

    # Seed the state by repeating the first preprocessed frame RESHAPE[1]
    # times along dim 1.
    initial = value_agent.preprocess(environment.reset()[0], SHAPE)
    states = torch.cat([initial] * RESHAPE[1], dim=1)

    DONE = False
    STEPS = REWARDS = 0
    while not DONE:
        action, new_states, rewards, DONE = value_agent.observe(environment, states, SHAPE)
        value_agent.remember(states, action, rewards)

        states = new_states
        REWARDS += rewards.item()
        STEPS += 1

    # Only games that earned a positive total reward are committed to memory.
    # NOTE(review): REMEMBER is documented as the fraction of reward-less games
    # to keep as well, but it is not applied here — confirm the intended policy.
    if REWARDS > 0:
        value_agent.memorize(states, STEPS)
    # The per-game buffer is always emptied before the next game.
    value_agent.memory["game"].clear()

    # Learn on every TRAIN_EVERY-th game, but only once START_TRAINING_AT games
    # have been played and the memory holds something to sample from.
    loss = None
    if (game >= START_TRAINING_AT
            and game % TRAIN_EVERY == 0
            and len(value_agent.memory["memory"]) > 0):
        loss = value_agent.learn(network=_value_agent)

    # Copy the online weights into the target copy every RESET_Q_EVERY games.
    if game % RESET_Q_EVERY == 0:
        _value_agent.load_state_dict(value_agent.state_dict())

print(f"Total training time: {round(time.perf_counter() - start, 2)} seconds")

#### Visualisation

##### In action

In [None]:
# Pass the environment, the agent and an output path to the project's
# `gif_stack` helper; the markdown cell below embeds the GIF at that same path.
# NOTE(review): presumably plays a game with the agent and writes the frames to
# the given file — confirm in _helpers/gif.py.
gif_stack(environment, value_agent, './output/value-image-tetris.gif')

<img src="./output/value-image-tetris.gif" width="1000" height="1000" />

In [None]:
# Close the environment now that training and GIF generation are finished.
environment.close()