# Value-based agent in the Tetris environment using PyTorch

|        | TYPE                   | VALUES          | DESCRIPTION                                                                                                |
|--------|------------------------|-----------------|------------------------------------------------------------------------------------------------------------|
| Action Space | ndarray<br/>(1,) | {0, 1, 2, 3, 4} | Action to manipulate the current tile.<br/>0: No action<br/>1: Rotate<br/>2: Right<br/>3: Left<br/>4: Down |
| Observation Space | ndarray<br/>(128,) | <0, 255> | The game state as the 128 bytes of console RAM (RAM version, not the rendered screen). |
| Reward |  | float | Reward given when a row is filled.<br/>Single: 1<br/>Double: 3<br/>Triple: 8<br/>Quadruple: 18       |
| Termination |  | boolean | The game ends when the pieces stack up to the top of the playing field.                                    |

In [None]:
import sys
import copy
import time
import torch
from tqdm import tqdm
import gymnasium as gym
import matplotlib.pyplot as plt

sys.path.append("../")

from agent_basic_continuous import DeepQ
from _helpers.plotting import visualise_dict  # noqa
from _helpers.gif import gif  # noqa

In [None]:
# Tetris environment with RAM observations (`obs_type="ram"`); frames are still
# rendered as RGB arrays so the agent can be recorded after training.
environment = gym.make(
    'ALE/Tetris-ram-v5',
    render_mode="rgb_array",
    obs_type="ram",
    frameskip=4,
    repeat_action_probability=0.25,
)

### Training

#### Parameters

In [None]:
# Number of games (episodes) the training loop plays.
GAMES = 1000

# Forwarded to DeepQ as `gamma` -- presumably the discount factor; confirm in
# agent_basic_continuous.
GAMMA = 0.99

# Forwarded to DeepQ as exploration_rate / exploration_decay / exploration_min
# -- presumably an epsilon-greedy schedule; confirm in agent_basic_continuous.
EXPLORATION_RATE = 1.0
EXPLORATION_DECAY = 0.995
EXPLORATION_MIN = 0.01

# MINIBATCH is forwarded to DeepQ as `batch_size`; the training loop calls
# `learn` once every TRAIN_EVERY games.
MINIBATCH = 64
TRAIN_EVERY = 10

# MEMORY is forwarded to DeepQ as `memory`; every RESET_Q_EVERY games the
# training loop copies the agent's weights into its deep-copied twin.
MEMORY = 1500
RESET_Q_EVERY = 250

# 128 inputs match the (128,) RAM observation, 5 outputs match the five actions.
NETWORK = {
    "inputs": 128,
    "outputs": 5,
    "nodes": [64, 32],
}
OPTIMIZER = {
    "optimizer": torch.optim.RMSprop,
    "lr": 0.0025,
}

#### Initialisation

In [None]:
# Build the learning agent from the hyperparameters defined above.
value_agent = DeepQ(
    network=NETWORK,
    optimizer=OPTIMIZER,
    gamma=GAMMA,
    batch_size=MINIBATCH,
    memory=MEMORY,
    exploration_rate=EXPLORATION_RATE,
    exploration_decay=EXPLORATION_DECAY,
    exploration_min=EXPLORATION_MIN,
)

# Independent copy of the agent; the training loop passes it to `learn` as
# `network` and re-synchronises its weights every RESET_Q_EVERY games.
_value_agent = copy.deepcopy(value_agent)

In [None]:
# Progress is reported every `checkpoint` games (about ten reports per run).
# Clamp to at least 1: for GAMES < 10 the integer division yields 0 and the
# training loop's `game % checkpoint` would raise ZeroDivisionError.
checkpoint = max(1, GAMES // 10)

# Per-game metrics are indexed by `game - 1`; a loss is only produced once every
# TRAIN_EVERY games, so that series is correspondingly shorter.
metrics = {
    "steps": torch.zeros(GAMES),
    "losses": torch.zeros(GAMES // TRAIN_EVERY),
    "exploration": torch.zeros(GAMES),
    "rewards": torch.zeros(GAMES)
}

#### Training

In [None]:
# Main training loop: play GAMES games, store every transition, train the agent
# every TRAIN_EVERY games and refresh the copied network every RESET_Q_EVERY
# games. Progress statistics are printed every `checkpoint` games.
start = time.time()
for game in tqdm(range(1, GAMES + 1), 
                 desc="Game", ncols=50, bar_format='%s{l_bar}{bar}|' % '\033[30m'):
    
    # `reset()` returns (observation, info); only the observation is needed.
    state = torch.tensor(environment.reset()[0], dtype=torch.float32).view(-1)
    terminated = truncated = False
    
    # LEARNING FROM GAME
    # ----------------------------------------------------------------------------------------------
    
    steps = 0
    rewards = 0
    while not (terminated or truncated):
        action = value_agent.action(state)
        
        new_state, reward, terminated, truncated, _ = environment.step(action.item())
        new_state = torch.tensor(new_state, dtype=torch.float32).view(-1)
        
        value_agent.remember(state, action, new_state, torch.tensor([reward]))
        state = new_state
        
        steps += 1
        rewards += reward
    # NOTE(review): presumably commits this game's `steps` transitions to the
    # agent's replay memory -- confirm against DeepQ.memorize.
    value_agent.memorize(steps)

    if game % TRAIN_EVERY == 0:
        loss = value_agent.learn(network=_value_agent)
        # The k-th training step (k = game // TRAIN_EVERY) is stored at index k - 1.
        metrics["losses"][game // TRAIN_EVERY - 1] = loss
    
    if game % RESET_Q_EVERY == 0:
        _value_agent.load_state_dict(value_agent.state_dict())

    # METRICS
    # ----------------------------------------------------------------------------------------------
    
    metrics["steps"][game-1] = steps
    metrics["exploration"][game-1] = value_agent.explore["rate"]
    metrics["rewards"][game-1] = rewards
    
    if game % checkpoint == 0 or game == GAMES:
        # Report over the `checkpoint` most recent games, i.e. indices
        # [game - checkpoint, game). The current game lives at index game - 1,
        # so the slice must end at `game`; ending at `game - 1` would drop the
        # game just played and pull in one from the previous window.
        _first = max(0, game - checkpoint)
        _mean_steps = metrics["steps"][_first:game].mean()
        # Losses of games in (_first, game] occupy indices
        # [_first // TRAIN_EVERY, game // TRAIN_EVERY).
        _mean_loss = metrics["losses"][_first // TRAIN_EVERY:game // TRAIN_EVERY].mean()
        _total_rewards = metrics["rewards"][_first:game].sum()
        
        print(f"Game {game:>6} {int(game/GAMES * 100):>16} % \n"
              f"{'-'*30} \n"
              f" > Average steps: {int(_mean_steps):>12} \n"
              f" > Average loss: {_mean_loss:>13.4f} \n"
              f" > Total rewards: {int(_total_rewards):>12} \n ")
        
print(f"Total training time: {time.time()-start:.2f} seconds")

#### Visualisation

##### Metrics

In [None]:
# Plot the collected training metrics with the project's plotting helper, save
# the figure to disk, then display it. The figure is saved before `plt.show()`
# is called.
# NOTE(review): assumes the ./output directory already exists -- confirm.
visualise_dict(metrics, title="Value-based: deep Q-learning agent")
plt.savefig("./output/value-elaborate-tetris.png")
plt.show()

##### In action

In [None]:
# Record the trained agent in the environment with the project's `gif` helper;
# presumably writes the animation to the given path (the image embedded below
# references the same file) -- confirm against _helpers.gif.
gif(environment, value_agent, "./output/value-elaborate-tetris.gif")

<img src="./output/value-elaborate-tetris.gif" width="1000" height="1000" />

In [None]:
# Close the environment now that training and recording are finished.
environment.close()