## IMPORTS

In [1]:
import os, sys, random, time
import warnings

import numpy as np
import torch
import tensorboard
import matplotlib.pyplot as plt

import gymnasium as gym
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback

import imageio
from IPython.display import Image

## STUP

In [2]:
# seeds
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
torch.cuda.manual_seed_all(SEED)

# Ensure deterministic behavior in CuDNN (NVIDIA backend for deep learning ops).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Python:", sys.version.split()[0])
print("PyTorch:", torch.__version__)
print("Device:", device)
print("CUDA:", torch.version.cuda if torch.cuda.is_available() else "None")

Python: 3.11.14
PyTorch: 2.8.0
Device: cpu
CUDA: None


# Path

In [3]:
BASE_DIR = './project_results/'

# --- DQN Setup ---
DQN_DIR = os.path.join(BASE_DIR, 'DQN')
os.makedirs(DQN_DIR, exist_ok=True)
DQN_LOGS = os.path.join(DQN_DIR, "tensorboard")
DQN_MODEL = os.path.join(DQN_DIR, "model_dqn.zip")

# --- PPO Setup ---
PPO_DIR = os.path.join(BASE_DIR, 'PPO')
os.makedirs(PPO_DIR, exist_ok=True)
PPO_LOGS = os.path.join(PPO_DIR, "tensorboard")
PPO_MODEL = os.path.join(PPO_DIR, "model_ppo.zip")

print(f"Pastas criadas em: {os.path.abspath(BASE_DIR)}")

Pastas criadas em: c:\Users\tomfp\Documents\GitHub\ATD_02\project_results


## ENVIRONMENT

In [4]:
def make_env():
    """Factory function that creates a fresh CartPole environment."""
    return gym.make("LunarLander-v3", render_mode="rgb_array")

env = DummyVecEnv([make_env])

  from pkg_resources import resource_stream, resource_exists


## MODEL

In [5]:
## MODEL DQN
model_dqn = DQN(
    policy='MlpPolicy',
    env=env,
    learning_rate=1e-3,
    buffer_size=50_000,
    exploration_fraction=0.2,
    batch_size=64,
    gamma=0.99,

    verbose=1,
    seed=SEED,
    tensorboard_log=DQN_LOGS, 
    device=device,            

    # Hiperparâmetros explícitos para o relatório
    train_freq=4, 
    learning_starts=500,  
    target_update_interval=10_000,
)

Using cpu device


In [6]:
## MODEL PPO
model_ppo = PPO(
    policy='MlpPolicy',
    env=env,
    learning_rate=3e-4,      # Padrão robusto para PPO
    n_steps=2048,            # Tamanho da janela de experiência antes do update
    batch_size=64,           # Tamanho do sub-batch para otimização
    n_epochs=10,             # Épocas de otimização por update
    gamma=0.99,              # Fator de desconto (mesmo do DQN para justiça)
    gae_lambda=0.95,         # Suavização de vantagem (exclusivo PPO/Actor-Critic)
    clip_range=0.2,          # Clipping do PPO (estabilidade teórica)
    ent_coef=0.01,           # Incentivo à exploração via entropia
    
    verbose=1,
    seed=SEED,               # Mesma seed inicial
    tensorboard_log=PPO_LOGS,
    device=device
)

Using cpu device


## LOGGER

In [7]:
class DQNLoggingCallback(BaseCallback):
    """
    Custom callback for logging exploration-related metrics during training.

    This callback tracks:
    - The exploration rate (epsilon) over time
    - Episode rewards (one value per completed episode)
    - The number of gradient update steps performed by the agent

    The data collected here will later be used to plot training curves directly inside the notebook, complementing (but not replacing) TensorBoard.
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)
        # list to store the metrics
        self.epsilon_history = []
        self.reward_history = []
        self.update_steps = []
        self.episode_reward = 0

    def _on_step(self) -> bool:
        # epsilon info
        if hasattr(self.model, "exploration_rate"):
            self.epsilon_history.append(self.model.exploration_rate)

        # reward info
        reward = self.locals.get("rewards")
        if reward is not None:
            self.episode_reward += reward[0]

        # check if episode has end (done)
        done = self.locals.get("dones")
        if done is not None and done[0]:
            self.reward_history.append(self.episode_reward)
            self.episode_reward = 0 # reset for next episode

        # number of gradient updates performed
        self.update_steps.append(self.model._n_updates)

        return True

## Train

In [8]:
## --- Treino do DQN ---
print("Treino do DQN...")
callback_dqn = DQNLoggingCallback()

model_dqn.learn(
    total_timesteps=100_000,
    callback=callback_dqn,
    progress_bar=True,
    tb_log_name="run_1"
)
model_dqn.save(DQN_MODEL)

Treino do DQN...
Logging to ./project_results/DQN\tensorboard\run_1_2


----------------------------------
| rollout/            |          |
|    exploration_rate | 0.982    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 5793     |
|    time_elapsed     | 0        |
|    total_timesteps  | 382      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.964    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3312     |
|    time_elapsed     | 0        |
|    total_timesteps  | 759      |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 1.06     |
|    n_updates        | 64       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.947    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 2583     |
|    time_elapsed   

In [9]:
## --- Treino do PPO ---
print("\nTreino do PPO...")
callback_ppo = DQNLoggingCallback()

model_ppo.learn(
    total_timesteps=100_000,
    callback=callback_ppo,
    progress_bar=True,
    tb_log_name="run_1"
)
model_ppo.save(PPO_MODEL)


Treino do PPO...
Logging to ./project_results/PPO\tensorboard\run_1_1
-----------------------------
| time/              |      |
|    fps             | 2107 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1494         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0071839932 |
|    clip_fraction        | 0.016        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.000301     |
|    learning_rate        | 0.0003       |
|    loss                 | 1.03e+03     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00624     |
|    value_loss           | 1.7