# Atlantis - Reinforcement Learning - PPO

In [None]:
import gymnasium as gym
from sb3_contrib import RecurrentPPO
import numpy as np
import os

In [None]:
class CustomAtlantisEnv(gym.Wrapper):
    """
    Gymnasium wrapper that replaces the reward of the wrapped environment
    with a shaped reward (see ``custom_reward_function``).

    The wrapper remembers the number of lives seen on the previous step in
    ``self.old_lives`` so that it can penalise the loss of a life.

    :param env: The environment to wrap. Its ``info`` dict must provide the
        keys ``'lives'`` and ``'episode_frame_number'``.
    """

    # Lives assumed at the start of an episode when ``reset()`` does not
    # report them (value taken from the original hard-coded initialisation).
    INITIAL_LIVES = 6

    def __init__(self, env):
        super().__init__(env)
        self.old_lives = self.INITIAL_LIVES

    def reset(self, *, seed=None, options=None):
        """
        Reset the wrapped environment and the life counter.

        Without this, ``old_lives`` would keep the (lower) value reached at
        the end of the previous episode and the life-loss penalty would
        never trigger again in later episodes.

        :return: ``(obs, info)`` exactly as returned by the wrapped env.
        """
        obs, info = self.env.reset(seed=seed, options=options)
        self.old_lives = info.get('lives', self.INITIAL_LIVES)
        return obs, info

    def step(self, action):
        """
        Step the wrapped environment and substitute the shaped reward.

        :param action: Action forwarded unchanged to the wrapped environment.
        :return: ``(obs, custom_reward, done, truncated, info)``; every
            element except the reward comes straight from the wrapped env.
        """
        obs, reward, done, truncated, info = self.env.step(action)

        custom_reward = self.custom_reward_function(obs, reward, done, info)

        return obs, custom_reward, done, truncated, info

    def custom_reward_function(self, obs, reward, done, info):
        """
        Compute the shaped reward for one step.

        * +4000 when the raw reward is exactly 2000 or 200.
        * +1000 when the raw reward is exactly 1000 or 100.
        * -500 when ``info['lives']`` dropped since the previous step.
        * A bonus of ``2 ** (episode_frame_number / 3000)``, i.e. a term
          that doubles every 3000 frames of the current episode.

        :param obs: Current observation (unused, kept for the interface).
        :param reward: Raw reward returned by the wrapped environment.
        :param done: Termination flag (unused, kept for the interface).
        :param info: Info dict of the step; must contain ``'lives'`` and
            ``'episode_frame_number'``.
        :return: The shaped reward.
        """
        if reward == 2000 or reward == 200:
            reward += 4000
        elif reward == 1000 or reward == 100:
            reward += 1000

        lives = info['lives']
        if self.old_lives > lives:
            reward += -500
        # Always track the latest value so the comparison stays correct even
        # if the life count were ever to go up between two steps.
        self.old_lives = lives

        reward += 2**(info['episode_frame_number']/3000)

        return reward

## Train and save the models with different timesteps

In [None]:
# Output locations: saved checkpoints and TensorBoard logs.
models_dir = "models/CUSTOM_RECURRENT"
logdir = "logs"

# exist_ok=True makes the creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
for directory in (models_dir, logdir):
    os.makedirs(directory, exist_ok=True)

# Create the environment
env = gym.make('ALE/Atlantis-v5', render_mode="rgb_array", obs_type="grayscale")
env.reset()

# Wrap the environment with the custom wrapper (shaped reward)
env = CustomAtlantisEnv(env)

# Initialize the model
model = RecurrentPPO('MlpLstmPolicy', env, verbose=0, device="cuda", tensorboard_log=logdir)

In [None]:
def print_stats(locals):
    """
    Print a short summary of a finished episode.

    :param locals: Mapping with an ``'iteration'`` entry and an ``'infos'``
        sequence whose first element holds ``'lives'`` and an ``'episode'``
        dict with the keys ``'r'``, ``'l'`` and ``'t'``. The parameter name
        shadows the ``locals`` builtin; it is kept so existing callers that
        pass it by keyword keep working.
    """
    print(f"Episode: {locals['iteration']}")

    first_info = locals['infos'][0]
    print(f"Lives: {first_info['lives']}")

    # Remaining lines all come from the per-episode summary dict.
    episode_summary = first_info['episode']
    for label, key in (("Reward", "r"), ("L", "l"), ("T", "t")):
        print(f"{label}: {episode_summary[key]}")

In [None]:
from stable_baselines3.common.callbacks import BaseCallback

class CustomCallback(BaseCallback):
    """
    Callback that, on every 10th finished training episode, checkpoints the
    model and plays one evaluation episode on a separate environment.

    The evaluation environment is created with ``gym.make`` directly and is
    NOT wrapped in ``CustomAtlantisEnv``, so the recorded evaluation reward
    is the environment's own reward, not the shaped training reward.

    One stat dict per evaluation is appended to ``self.episodes_stats``.

    Attributes provided by ``BaseCallback`` and used here: ``self.model``
    (the algorithm being trained) and ``self.locals`` (local variables
    exposed by the training loop).

    The checkpoint is written below the notebook-level ``models_dir``.

    :param verbose: Verbosity level: 0 for no output, 1 for info messages, 2 for debug messages
    """
    def __init__(self, verbose: int = 0):
        super().__init__(verbose)
        # Dedicated evaluation environment. It is deliberately not closed in
        # _on_training_end because the same callback instance is passed to
        # several consecutive learn() calls.
        env = gym.make('ALE/Atlantis-v5', render_mode="rgb_array", obs_type="grayscale")
        env.reset()
        self.test_env = env
        # Number of finished training episodes seen so far (first env only).
        self.counter = 0
        # One dict per evaluation run, see _on_step.
        self.episodes_stats = []

    def _on_training_start(self) -> None:
        """
        This method is called before the first rollout starts.
        """
        pass

    def _on_rollout_start(self) -> None:
        """
        A rollout is the collection of environment interaction
        using the current policy.
        This event is triggered before collecting new samples.
        """
        pass

    def _on_step(self) -> bool:
        """
        This method will be called by the model after each call to `env.step()`.

        When the first training environment reports a finished episode the
        episode counter is increased; every 10th time the model is
        checkpointed and evaluated, and the result is recorded.

        :return: If the callback returns False, training is aborted early.
            This implementation always returns True.
        """
        if not self.locals['dones'][0]:
            return True

        self.counter += 1
        if self.counter % 10 != 0:
            return True

        count_rewards = self._evaluate_checkpoint()

        # NOTE(review): the 'episode' entry is presumably added by the
        # Monitor wrapper SB3 puts around the training env - confirm.
        # NOTE(review): 'iteration' comes from the training loop's locals;
        # verify it is refreshed per step and is the value wanted under the
        # "episode" key.
        last_info = self.locals['infos'][0]
        stat = {
            "episode": self.locals['iteration'],
            "lives": last_info['lives'],
            "reward": count_rewards,
            "l": last_info['episode']['l'],
            "t": last_info['episode']['t']
        }

        print(f"Reward: {count_rewards}")

        self.episodes_stats.append(stat)
        return True

    def _evaluate_checkpoint(self) -> float:
        """
        Save the current model, reload it bound to the evaluation env and
        play one full episode.

        :return: Sum of the (unshaped) rewards of the evaluation episode as
            a plain Python float, so the stats stay JSON-serialisable.
        """
        checkpoint_path = f"{models_dir}/model_custom_recurrent"
        self.model.save(checkpoint_path)

        # load() is a classmethod; go through the class of the model being
        # trained instead of relying on a notebook-level `model` global.
        model_copy = type(self.model).load(checkpoint_path, env=self.test_env)

        # Drive the episode exclusively through the vectorised env returned
        # by the loaded model so observations keep one consistent (batched)
        # layout; the original mixed vec_env.reset() with raw-env step().
        vec_env = model_copy.get_env()
        obs = vec_env.reset()

        # A recurrent policy needs its LSTM state fed back on every call,
        # plus a flag marking episode starts so the state is reset there.
        lstm_states = None
        episode_starts = np.ones((vec_env.num_envs,), dtype=bool)

        count_rewards = 0.0
        done = False
        while not done:
            action, lstm_states = model_copy.predict(
                obs, state=lstm_states, episode_start=episode_starts
            )
            obs, rewards, dones, _infos = vec_env.step(action)
            episode_starts = dones
            count_rewards += float(rewards[0])
            done = bool(dones[0])

        return count_rewards

    def _on_rollout_end(self) -> None:
        """
        This event is triggered before updating the policy.
        """
        pass

    def _on_training_end(self) -> None:
        """
        This event is triggered before exiting the `learn()` method.
        """
        pass

In [None]:
# Callback that periodically checkpoints/evaluates and collects stats.
custom = CustomCallback()

TIME_STEPS = 100000  # timesteps requested per learn() call
MAX_ITERS = 10       # number of learn() calls / saved checkpoints

try:
    for iters in range(1, MAX_ITERS + 1):
        # reset_num_timesteps=False keeps the timestep counter (and the
        # TensorBoard run) continuous across the successive learn() calls.
        model.learn(total_timesteps=TIME_STEPS, reset_num_timesteps=False, tb_log_name="CUSTOM_RECURRENT", callback=custom)

        # Save the model, named after the cumulative number of timesteps
        model.save(f"{models_dir}/{TIME_STEPS*iters}.zip")
finally:
    # Release the emulator even when training is interrupted or fails.
    env.close()

In [None]:
import json

# Echo the collected evaluation records to the cell output.
print(custom.episodes_stats)

# File that receives the JSON dump (relative to the working directory).
file_name = "custom_recurrent_stats.json"

# custom.episodes_stats is the list that CustomCallback appends one stat
# dict to per evaluation run.
data = custom.episodes_stats

# Serialise first, then write the resulting text in one go.
with open(file_name, "w") as json_file:
    json_file.write(json.dumps(data))


## Test the different models

In [None]:
MAX_ITERS = 10
TIME_STEPS = 100000

# Create the environment. With render_mode="human" the environment draws
# itself while stepping, so no explicit render() call is needed below.
env = gym.make('ALE/Atlantis-v5', render_mode="human", obs_type="grayscale")
env.reset()

try:
    # Replay every checkpoint written by the training loop, oldest first.
    for iters in range(1, MAX_ITERS + 1):

        model_path = f"{models_dir}/{TIME_STEPS*iters}.zip"
        model = RecurrentPPO.load(model_path, env=env)

        print(f"Model: {model_path}")

        episodes = 1

        # Step through the vectorised env owned by the loaded model so that
        # reset() and step() observations share one (batched) layout; the
        # original reset the vec env but stepped the raw env.
        vec_env = model.get_env()

        for ep in range(episodes):
            print(f"Episode: {ep}")
            obs = vec_env.reset()

            # Recurrent policies need the LSTM state passed back in on each
            # call and a flag telling them where an episode starts.
            lstm_states = None
            episode_starts = np.ones((vec_env.num_envs,), dtype=bool)

            done = False
            while not done:
                action, lstm_states = model.predict(
                    obs, state=lstm_states, episode_start=episode_starts
                )
                obs, rewards, dones, info = vec_env.step(action)
                episode_starts = dones
                done = bool(dones[0])
finally:
    # Close once, after ALL models were replayed. The original closed the
    # env inside the loop, leaving every later model with a closed env.
    env.close()