# Value-based vision agent in the Breakout environment using PyTorch

|        | TYPE                   | VALUES          | DESCRIPTION                                                                                                |
|--------|------------------------|-----------------|------------------------------------------------------------------------------------------------------------|
| Action Space | ndarray<br/>(1,) | {0, 1, 2, 3} | 0: No action<br/>1: Fire<br/>2: Right<br/>3: Left |
| Observation Space | ndarray<br/>(210,160) | <0, 255> | The game screen. |
| Reward | float |  | Reward given when a brick is hit.<br/>Red: 7<br/>Orange: 7<br/>Yellow: 4<br/>Green: 4<br/>Aqua: 1<br/>Blue: 1 |
| Termination | boolean |  | If the ball is missed by the paddle. |

In [None]:
import copy
import time
import torch
from tqdm import tqdm
import gymnasium as gym

from DQN import VisionDeepQ
from utilities import visualisation

In [None]:
# Build the Breakout environment. Observations are requested as grayscale
# frames and rendering as RGB arrays. `frameskip=1` and
# `repeat_action_probability=0.0` are passed explicitly; the SKIP constant is
# handed to the agent separately in the training loop.
environment = gym.make(
    'ALE/Breakout-v5',
    render_mode="rgb_array",
    obs_type="grayscale",
    frameskip=1,
    repeat_action_probability=0.0,
)

# Frame rate advertised in the environment's metadata.
environment.metadata["render_fps"] = 30

### Training

#### Parameters

|Parameter|Description|
|---------|-----------|
| SKIP | number of frames to skip between each saved frame |
| SHAPE | how to reshape the `original` image |
| DISCOUNT | discount rate for rewards |
| GAMMA | discount rate for Q-learning |
| GRADIENTS | clamp the gradients between these values (or None for no clamping) |
| PUNISHMENT | punishment for losing |
| INCENTIVE | incentive for rewards |
| EXPLORATION_RATE | initial exploration rate |
| EXPLORATION_MIN | minimum exploration rate |
| EXPLORATION_STEPS | number of games over which the exploration rate decays from `EXPLORATION_RATE` to `EXPLORATION_MIN` |
| MINIBATCH | size of the minibatch |
| TRAIN_EVERY | train the network every n games |
| START_TRAINING_AT | start training after n games |
| REMEMBER | remember every game that earned a reward, plus this fraction of the games that earned none |
| MEMORY | size of the agent's internal memory |
| RESET_Q_EVERY | update target-network every n games |


Parameters based on [He-Ze](https://github.com/He-Ze/DQN-breakout/tree/main)

In [None]:
# --- Run length ------------------------------------------------------------
GAMES = 100000      # number of games played by the training loop
SKIP = 4            # frames to skip between each saved frame
CHECKPOINT = 10000  # NOTE(review): not referenced by any visible cell

# --- Image handling --------------------------------------------------------
SHAPE = {
    "original": (1, 1, 210, 160),  # matches the (210, 160) game screen
    "max_pooling": 2,
}

# --- Returns and gradients -------------------------------------------------
DISCOUNT = 0.95         # discount rate for rewards
GAMMA = 0.99            # discount rate for Q-learning
GRADIENTS = (-1, 1)     # gradient clamp range (or None for no clamping)

PUNISHMENT = -10        # punishment for losing
INCENTIVE = 1           # incentive for rewards

# --- Training schedule -----------------------------------------------------
MINIBATCH = 32          # size of the minibatch
TRAIN_EVERY = 4         # train the network every n games
START_TRAINING_AT = 250 # start training after n games

# --- Exploration -----------------------------------------------------------
EXPLORATION_RATE = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_STEPS = 10000 // TRAIN_EVERY

# --- Memory ----------------------------------------------------------------
REMEMBER = 0.005  # NOTE(review): not passed to the agent in the visible cells


def MIN_REWARD(game):
    """Return the total reward a game must exceed to be memorised.

    The threshold is ``game / 1000`` for game numbers up to and including
    100000, and a flat 500 for any later game.

    The upper-case name is kept because the training loop calls it as
    ``MIN_REWARD(game)``.
    """
    return game / 1000 if game <= 100000 else 500


MEMORY = 250                       # size of the agent's internal memory
RESET_Q_EVERY = TRAIN_EVERY * 500  # update target-network every n games

# --- Network and optimiser -------------------------------------------------
NETWORK = {
    "input_channels": 4, "outputs": 4,  # four outputs, one per action
    "channels": [32, 64, 64],
    "kernels": [8, 4, 3],
    "padding": ["valid", "valid", "valid"],
    "strides": [4, 2, 1],
    "nodes": [128],
}
OPTIMIZER = {
    "optimizer": torch.optim.Adam,
    "lr": 0.0000625,
    "hyperparameters": {"eps": 1.5e-4}
}

#### Initialisation

In [None]:
# Agent that plays the games and is trained in the loop below.
value_agent = VisionDeepQ(
    # Model, optimiser and image-shape configuration.
    network=NETWORK,
    optimizer=OPTIMIZER,
    shape=SHAPE,
    # Replay settings.
    batch_size=MINIBATCH,
    memory=MEMORY,
    # Reward shaping.
    discount=DISCOUNT,
    gamma=GAMMA,
    punishment=PUNISHMENT,
    incentive=INCENTIVE,
    # Exploration schedule.
    exploration_rate=EXPLORATION_RATE,
    exploration_steps=EXPLORATION_STEPS,
    exploration_min=EXPLORATION_MIN,
)

# Independent deep copy of the freshly built agent; the training loop passes
# it to `learn(network=...)` and periodically re-syncs its weights.
_value_agent = copy.deepcopy(value_agent)

#### Training

In [None]:
# Play GAMES games. After each game: optionally store it in the agent's
# memory, optionally run a learning step, and optionally copy the agent's
# weights into `_value_agent`.
start = time.time()
for game in tqdm(range(1, GAMES + 1), 
                 desc="Game", ncols=50, bar_format='%s{l_bar}{bar}|' % '\033[30m'):

    # `reset()` returns (observation, info); only the observation is used.
    initial = value_agent.preprocess(environment.reset()[0])
    # Start from a stack made of the first frame repeated along dim 1.
    states = torch.cat([initial] * value_agent.shape["reshape"][1], dim=1)

    DONE = False
    STEPS = REWARDS = 0
    while not DONE:
        action, new_states, rewards, DONE = value_agent.observe(environment, states, SKIP)
        value_agent.remember(states, action, rewards)

        states = new_states
        REWARDS += rewards.item()
        STEPS += 1

    # Only games whose total reward beats the per-game threshold are memorised.
    if REWARDS > MIN_REWARD(game):
        value_agent.memorize(states, STEPS)
    value_agent.memory["game"].clear()

    # Learn every TRAIN_EVERY games, once the warm-up is over and the memory
    # holds at least one entry; `loss` is None for games without a learn step.
    if (game % TRAIN_EVERY == 0
            and len(value_agent.memory["memory"]) > 0
            and game >= START_TRAINING_AT):
        loss = value_agent.learn(network=_value_agent, clamp=GRADIENTS)
    else:
        loss = None

    # Periodically sync the copy passed as `network=` with the trained agent.
    if game % RESET_Q_EVERY == 0:
        _value_agent.load_state_dict(value_agent.state_dict())

print(f"Total training time: {round(time.time() - start, 2)} seconds")

#### Visualisation

##### In action

In [None]:
# Produce './dqn-breakout.gif' from the agent acting in the environment; the
# markdown below embeds that file. SKIP is the same value that is passed to
# `observe()` during training.
# NOTE(review): presumably records a single game -- confirm in
# utilities.visualisation.gif.
visualisation.gif(environment, value_agent, './dqn-breakout.gif', SKIP)

<img src="./dqn-breakout.gif" width="1000" height="1000" />

In [None]:
# Close the environment now that training and visualisation are finished.
environment.close()