# Value-based agent in the Tetris environment using PyTorch

|        | TYPE                   | VALUES          | DESCRIPTION                                                                                                |
|--------|------------------------|-----------------|------------------------------------------------------------------------------------------------------------|
| Action Space | ndarray<br/>(1,) | {0, 1, 2, 3, 4} | Action to manipulate the current tile.<br/>0: No action<br/>1: Rotate<br/>2: Right<br/>3: Left<br/>4: Down |
| Observation Space | ndarray<br/>(128,) | <0, 255> | The 128 bytes of the console's RAM (the RAM version of the environment is used instead of the game screen). |
| Reward | float |  | Reward given when a row is filled.<br/>Single: 1<br/>Double: 3<br/>Triple: 8<br/>Quadruple: 18       |
| Termination | boolean |  | The game ends when the pieces stack up to the top of the playing field.                                    |

In [None]:
import copy
import time
import torch
from tqdm import tqdm
import gymnasium as gym

from DQN import DeepQ
from utilities import visualisation

In [None]:
# Options forwarded to `gym.make`: rendering as RGB arrays, RAM observations,
# a frameskip of 1 and a repeat-action probability of 0.
ENVIRONMENT_OPTIONS = {
    "render_mode": "rgb_array",
    "obs_type": "ram",
    "frameskip": 1,
    "repeat_action_probability": 0.0,
}
environment = gym.make('ALE/Tetris-ram-v5', **ENVIRONMENT_OPTIONS)

# Frame rate entry in the environment's metadata.
environment.metadata["render_fps"] = 30

### Training

#### Parameters

|Parameter|Description|
|---------|-----------|
| SKIP | number of frames to skip between each saved frame |
| SHAPE | how to reshape the `original` image (not set in this notebook, which uses the RAM observation rather than an image) |
| DISCOUNT | discount rate for rewards |
| GAMMA | discount rate for Q-learning |
| GRADIENTS | clamp the gradients between these values (or None for no clamping) |
| PUNISHMENT | punishment for losing |
| INCENTIVE | incentive for rewards |
| EXPLORATION_RATE | initial exploration rate |
| EXPLORATION_MIN | minimum exploration rate |
| EXPLORATION_STEPS | number of games to decay exploration rate from `EXPLORATION_RATE` to `EXPLORATION_MIN` |
| MINIBATCH | size of the minibatch |
| TRAIN_EVERY | train the network every n games |
| START_TRAINING_AT | start training after n games |
| REMEMBER | only remember games with rewards, and this fraction of the games without |
| MEMORY | size of the agent's internal memory |
| RESET_Q_EVERY | update target-network every n games |

In [None]:
# --- Run length ------------------------------------------------------------
GAMES = 25_000        # total number of games to play
SKIP = 4              # frames to skip between each saved frame
CHECKPOINT = 5_000

# --- Returns and gradients -------------------------------------------------
DISCOUNT = GAMMA = 0.99   # reward discount / Q-learning discount
GRADIENTS = (-1, 1)       # gradient clamp range (None disables clamping)

# --- Reward shaping --------------------------------------------------------
PUNISHMENT, INCENTIVE = -1, 1

# --- Training schedule (counted in games) ----------------------------------
MINIBATCH = 64
TRAIN_EVERY = 4
START_TRAINING_AT = 1_000

# --- Exploration -----------------------------------------------------------
EXPLORATION_RATE = 1.0
EXPLORATION_MIN = 0.001
# Expressed relative to the training interval.
EXPLORATION_STEPS = 10_000 // TRAIN_EVERY

# --- Memory and target network ---------------------------------------------
REMEMBER = 0.005                    # fraction of reward-free games to keep
MEMORY = 500                        # size of the agent's internal memory
RESET_Q_EVERY = TRAIN_EVERY * 50    # games between target-network updates

# --- Model and optimiser ---------------------------------------------------
NETWORK = {
    "inputs": 128,
    "outputs": 5,
    "nodes": [512, 256],
}
OPTIMIZER = {
    "optimizer": torch.optim.RMSprop,
    "lr": 0.0025,
}

#### Initialisation

In [None]:
# Keyword arguments for the agent, collected from the parameters defined above.
AGENT_SETTINGS = {
    "network": NETWORK,
    "optimizer": OPTIMIZER,
    "batch_size": MINIBATCH,
    "memory": MEMORY,
    "discount": DISCOUNT,
    "gamma": GAMMA,
    "punishment": PUNISHMENT,
    "incentive": INCENTIVE,
    "exploration_rate": EXPLORATION_RATE,
    "exploration_steps": EXPLORATION_STEPS,
    "exploration_min": EXPLORATION_MIN,
}
value_agent = DeepQ(**AGENT_SETTINGS)

# Independent copy of the agent. The training loop passes it to `learn` as
# `network` and reloads it from `value_agent` every `RESET_Q_EVERY` games.
_value_agent = copy.deepcopy(value_agent)

#### Training

In [None]:
# Train the agent for `GAMES` games, timing the whole run.
start = time.time()
for game in tqdm(range(1, GAMES + 1),
                 desc="Game", ncols=50, bar_format='%s{l_bar}{bar}|' % '\033[30m'):

    # Play one game from a fresh reset, storing every transition in the
    # agent's per-game buffer.
    state = value_agent.preprocess(environment.reset()[0])

    DONE = False
    STEPS = REWARDS = 0
    while not DONE:
        action, new_state, rewards, DONE = value_agent.observe(environment, state, SKIP)
        value_agent.remember(state, action, rewards)

        state = new_state
        REWARDS += rewards.item()
        STEPS += 1

    # Keep every game that earned a reward, plus a random fraction `REMEMBER`
    # of the games that did not (as described in the parameter table). The
    # random draw only happens for reward-free games.
    # NOTE(review): assumes `memorize` copes with games without any reward --
    # confirm against the DeepQ implementation.
    if REWARDS > 0 or torch.rand(1).item() < REMEMBER:
        value_agent.memorize(state, STEPS)
    value_agent.memory["game"].clear()

    # Learn every `TRAIN_EVERY` games once `START_TRAINING_AT` games have been
    # played and there is something in memory; `loss` stays None otherwise.
    loss = None
    if (game % TRAIN_EVERY == 0
            and len(value_agent.memory["memory"]) > 0
            and game >= START_TRAINING_AT):
        loss = value_agent.learn(network=_value_agent, clamp=GRADIENTS)

    # Periodically copy the trained weights into the second network.
    if game % RESET_Q_EVERY == 0:
        _value_agent.load_state_dict(value_agent.state_dict())

print(f"Total training time: {round(time.time() - start, 2)} seconds")

#### Visualisation

##### In action

In [None]:
# Output path handed to `visualisation.gif` (presumably where the GIF of the
# agent playing is written -- confirm against `utilities.visualisation`).
# NOTE(review): the file name says "breakout" although this notebook uses the
# Tetris environment; it is kept unchanged because the image tag below
# references the same path.
GIF_PATH = './dqn-breakout.gif'
visualisation.gif(environment, value_agent, GIF_PATH, SKIP)

<img src="./dqn-breakout.gif" width="1000" height="1000" />

In [None]:
# Close the environment once training and visualisation are done.
environment.close()