# Atlantis - Reinforcement Learning - PPO

In [None]:
import gymnasium as gym
from stable_baselines3 import PPO
import numpy as np
import os

## Train and save the models with different timesteps

In [None]:
# Output locations: model checkpoints and TensorBoard logs.
models_dir = "models/PPO"
logdir = "logs"

# exist_ok=True creates each directory only when it is missing, without a
# separate (race-prone) existence check beforehand.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

# Create the environment
env = gym.make('ALE/Atlantis-v5', render_mode="rgb_array")
env.reset()

# Initialize the model
# NOTE(review): the Atari observation is presumably an RGB image; the
# Stable-Baselines3 docs suggest 'CnnPolicy' for image inputs - confirm
# whether 'MlpPolicy' is intentional here.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=logdir)

# Timesteps requested per learn() call, and number of checkpoints to produce.
TIME_STEPS = 100000
MAX_ITERS = 10
iters = 0

try:
    for iters in range(1, MAX_ITERS + 1):
        # reset_num_timesteps=False keeps the timestep counter running across
        # calls, so every iteration continues the same training run / TB curve.
        model.learn(total_timesteps=TIME_STEPS, reset_num_timesteps=False, tb_log_name="PPO")
        # Checkpoints are named after the cumulative number of requested steps.
        model.save(f"{models_dir}/{TIME_STEPS*iters}")
        print(f"Model {TIME_STEPS*iters} saved")
finally:
    # Release the emulator even if training is interrupted.
    env.close()

## Test the different models

In [None]:
# Play a few rendered episodes with every checkpoint written by the training
# cell. Relies on models_dir, TIME_STEPS and MAX_ITERS defined there.
iters = 0

# Number of episodes played with each checkpoint.
episodes = 5

# Create the environment
env = gym.make('ALE/Atlantis-v5', render_mode="human")

try:
    for iters in range(1, MAX_ITERS + 1):
        model_path = f"{models_dir}/{TIME_STEPS*iters}.zip"
        model = PPO.load(model_path, env=env)

        for ep in range(episodes):
            # Drive the raw Gymnasium env for the whole episode so that the
            # observations passed to predict() always have the same layout.
            obs, info = env.reset()
            done = False
            while not done:
                action, _states = model.predict(obs)

                # predict() returns a NumPy value; the env expects a plain int.
                action = int(np.asarray(action).item())

                # With render_mode="human" the frame is drawn during step(),
                # so no explicit render() call is needed.
                obs, rewards, terminated, truncated, info = env.step(action)
                done = terminated or truncated
finally:
    # Close once, after ALL checkpoints have been evaluated (closing inside
    # the loop would leave later checkpoints with a closed environment).
    env.close()