# Atlantis - Reinforcement Learning - A2C

In [1]:
import json
import os

import gymnasium as gym
import numpy as np
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import BaseCallback

## Train and save the models with different timesteps

In [7]:
# Output locations: trained checkpoints go to `models_dir`, statistics to `logdir`.
models_dir = "models/A2C"
logdir = "logs"

# exist_ok=True makes this cell idempotent: re-running it (or running it while
# the directories already exist) does not raise and needs no prior existence check.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

# Create the environment
env = gym.make('ALE/Atlantis-v5', render_mode="rgb_array")
env.reset()

# Initialize the model
# NOTE(review): the captured output shows the env being wrapped in a
# VecTransposeImage, i.e. observations appear to be images. 'MlpPolicy' is kept
# so existing checkpoints stay comparable; consider 'CnnPolicy' - TODO confirm.
model = A2C('MlpPolicy', env, verbose=1, device="cuda")



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [8]:
class CustomCallback(BaseCallback):
    """
    Callback that records one statistics entry per finished training episode.

    Every time a sub-environment reports ``done``, a dict with the keys
    ``"episode"``, ``"lives"``, ``"reward"``, ``"l"`` and ``"t"`` is printed and
    appended to ``self.episodes_stats``. Episodes are numbered by a counter owned
    by the callback (starting at 1, continuing across successive ``learn()``
    calls that reuse the same instance) instead of the trainer's ``iteration``
    local, which counts rollouts rather than episodes.

    The lifecycle hooks other than ``_on_step`` are not overridden here, so the
    ones inherited from ``BaseCallback`` are used.

    :param verbose: Verbosity level: 0 for no output, 1 for info messages, 2 for debug messages
    """
    def __init__(self, verbose: int = 0):
        super().__init__(verbose)
        # One dict per finished episode, in the order the episodes ended.
        self.episodes_stats = []
        # Running number of finished episodes seen by this callback instance.
        self.episode_count = 0

    def _on_step(self) -> bool:
        """
        Record statistics for every sub-environment whose episode just ended.

        Reads ``dones`` and ``infos`` from ``self.locals``.

        :return: Always True, so training is never aborted by this callback.
        """
        for done, info in zip(self.locals['dones'], self.locals['infos']):
            if not done:
                continue

            # Presumably the 'episode' entry is added by the Monitor wrapper the
            # env is wrapped in; skip the step instead of raising if it is absent.
            episode_info = info.get('episode')
            if episode_info is None:
                continue

            self.episode_count += 1
            stat = {
                "episode": self.episode_count,
                "lives": info.get('lives'),
                "reward": episode_info['r'],
                "l": episode_info['l'],
                "t": episode_info['t']
            }

            print(f"Episode: {stat['episode']}")
            print(f"Lives: {stat['lives']}")
            print(f"Reward: {stat['reward']}")
            print(f"L: {stat['l']}")
            print(f"T: {stat['t']}")

            self.episodes_stats.append(stat)
        return True

In [9]:
# One callback instance is shared by every learn() call so that the collected
# episode statistics accumulate over the whole training run.
custom = CustomCallback()

TIME_STEPS = 100000  # timesteps requested per learn() call
MAX_ITERS = 10       # number of learn() calls, i.e. of saved checkpoints
stats_path = f"{logdir}/A2C_Atlantis.json"

for iters in range(1, MAX_ITERS + 1):
    # reset_num_timesteps=False continues the model's timestep counter across
    # calls instead of restarting it for each checkpoint.
    model.learn(total_timesteps=TIME_STEPS, reset_num_timesteps=False, callback=custom)

    model.save(f"{models_dir}/A2C_Atlantis_{iters}")

    # store the stats after every checkpoint, so an interrupted run still
    # leaves the statistics gathered so far on disk
    with open(stats_path, 'w') as f:
        json.dump(custom.episodes_stats, f)
    

-------------------------------------
| time/                 |           |
|    fps                | 63        |
|    iterations         | 100       |
|    time_elapsed       | 7         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.36     |
|    explained_variance | -8.82e-06 |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | -0.0224   |
|    value_loss         | 0.000327  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 59        |
|    iterations         | 200       |
|    time_elapsed       | 16        |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -1.15     |
|    explained_variance | -2.32e-05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 199       |
|    policy_loss        | -0.00116  |
|    value_l

## Test the different models

In [5]:
# NOTE: this cell uses `models_dir` and `MAX_ITERS` defined in the training
# section; run those cells first (running it on a fresh kernel raises NameError).
EVAL_EPISODES = 5  # episodes played with each saved checkpoint

# Create the environment
# With render_mode="human" the window is presumably refreshed by the
# environment itself on every step, so no explicit render() call is made below.
env = gym.make('ALE/Atlantis-v5', render_mode="human")
env.reset()

try:
    for iters in range(1, MAX_ITERS + 1):
        # Must match the name used by model.save() in the training cell.
        model_path = f"{models_dir}/A2C_Atlantis_{iters}.zip"
        model = A2C.load(model_path, env=env)

        # Reset AND step through the vectorized env attached to the model, so
        # observations always pass through the same wrappers.
        vec_env = model.get_env()

        for ep in range(EVAL_EPISODES):
            obs = vec_env.reset()
            done = False
            while not done:
                action, _states = model.predict(obs)
                obs, rewards, dones, infos = vec_env.step(action)
                # Single environment: index 0 is the only entry.
                done = bool(dones[0])
finally:
    # Close once, after every checkpoint has been evaluated (or on error);
    # closing inside the loop would leave later checkpoints with a closed env.
    env.close()

NameError: name 'MAX_ITERS' is not defined