## IMPORTS

In [5]:
import os
import sys
import random
import time
import warnings
import json

import numpy as np
import torch
import matplotlib.pyplot as plt

import gymnasium as gym
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback


## BASE SETUP

In [6]:
SEEDS = [42, 67, 136]
TOTAL_TIMESTEPS = 100_000

BASE_DIR = "./project_results"

# --- DQN ---
DQN_DIR = os.path.join(BASE_DIR, "DQN")
DQN_LOGS = os.path.join(DQN_DIR, "tensorboard")
os.makedirs(DQN_DIR, exist_ok=True)

# --- PPO ---
PPO_DIR = os.path.join(BASE_DIR, "PPO")
PPO_LOGS = os.path.join(PPO_DIR, "tensorboard")
os.makedirs(PPO_DIR, exist_ok=True)

print(f"Resultados serão guardados em: {os.path.abspath(BASE_DIR)}")

Resultados serão guardados em: c:\Users\tomfp\Documents\GitHub\ATD_02\project_results


## FUNCTIONS

In [7]:
class DQNLoggingCallback(BaseCallback):
    """
    Logs epsilon, episodic reward and number of updates during DQN training.
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.epsilon_history = []
        self.reward_history = []
        self.update_steps = []
        self.episode_reward = 0.0

    def _on_step(self) -> bool:
        # Exploration rate (epsilon)
        if hasattr(self.model, "exploration_rate"):
            self.epsilon_history.append(self.model.exploration_rate)

        # Reward
        reward = self.locals.get("rewards")
        if reward is not None:
            self.episode_reward += reward[0]

        # End of episode
        done = self.locals.get("dones")
        if done is not None and done[0]:
            self.reward_history.append(self.episode_reward)
            self.episode_reward = 0.0

        # Gradient updates
        self.update_steps.append(self.model._n_updates)
        return True
    
class PPOLoggingCallback(BaseCallback):
    """
    Logs entropy, episodic reward, policy loss and value loss during PPO training.
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.entropy_history = []
        self.policy_loss_history = []
        self.value_loss_history = []
        self.reward_history = []
        self.episode_reward = 0.0

    def _on_step(self) -> bool:
        # Reward
        reward = self.locals.get("rewards")
        if reward is not None:
            self.episode_reward += reward[0]

        # End of episode
        done = self.locals.get("dones")
        if done is not None and done[0]:
            self.reward_history.append(self.episode_reward)
            self.episode_reward = 0.0

        # Entropy
        entropy = self.locals.get("ent_coef")  # coef * entropy
        if entropy is not None:
            self.entropy_history.append(entropy)

        # Policy and value loss
        if "loss" in self.locals:
            loss = self.locals["loss"]
            # PPO returns a dict with keys: 'policy_loss', 'value_loss', 'entropy_loss' sometimes
            # We'll try to log if available
            if isinstance(loss, dict):
                self.policy_loss_history.append(loss.get("policy_loss", np.nan))
                self.value_loss_history.append(loss.get("value_loss", np.nan))

        return True

def setup(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print("Python:", sys.version.split()[0])
    print("PyTorch:", torch.__version__)
    print("Device:", device)
    print("CUDA:", torch.version.cuda if torch.cuda.is_available() else "None")
    print("")

def make_env(seed: int):
    def _init():
        env = gym.make("LunarLander-v3")
        env.reset(seed=seed)
        return env
    return _init


def evaluate_agent(model, env_fn, n_episodes=20, render=False):
    """
    Avalia um agente em n_episodes determinísticos.
    Retorna rewards e comprimentos de episódios.
    """
    rewards = []
    lengths = []

    env = env_fn()
    for ep in range(n_episodes):
        obs, _ = env.reset()
        done = False
        total_reward = 0.0
        steps = 0

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            steps += 1
            done = terminated or truncated

            if render:
                env.render()

        rewards.append(total_reward)
        lengths.append(steps)

    env.close()
    return np.array(rewards), np.array(lengths)

## TRAIN LOOP

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dqn_eval_results = {}
ppo_eval_results = {}

for seed in SEEDS:
    print(f"\n================= SEED {seed} =================\n")
    setup(seed)

    # -----------------------
    # DQN
    # -----------------------
    print(f"\n>>> Iniciando treino DQN com seed {seed} <<<\n")
    dqn_env = DummyVecEnv([make_env(seed)])
    dqn_callback = DQNLoggingCallback()

    model_dqn = DQN(
        policy="MlpPolicy",
        env=dqn_env,
        learning_rate=1e-3,
        buffer_size=50_000,
        exploration_fraction=0.2,
        batch_size=64,
        gamma=0.99,
        train_freq=4,
        learning_starts=500,
        target_update_interval=10_000,
        verbose=1,
        seed=seed,
        tensorboard_log=DQN_LOGS,
        device=device
    )

    model_dqn.learn(
        total_timesteps=TOTAL_TIMESTEPS,
        callback=dqn_callback,
        tb_log_name=f"dqn_seed_{seed}",
        progress_bar=True
    )

    model_dqn.save(os.path.join(DQN_DIR, f"model_dqn_seed_{seed}"))
    dqn_env.close()

    # --- Avaliação DQN ---
    rewards, lengths = evaluate_agent(model_dqn, make_env(seed), n_episodes=20)
    dqn_eval_results[seed] = {
        "mean_reward": rewards.mean(),
        "std_reward": rewards.std(),
        "mean_length": lengths.mean(),
        "std_length": lengths.std()
    }
    print(f"DQN Seed {seed}: Mean Reward = {rewards.mean():.2f} ± {rewards.std():.2f}, "
          f"Mean Length = {lengths.mean():.1f} ± {lengths.std():.1f}")

    # -----------------------
    # PPO
    # -----------------------
    print(f"\n>>> Iniciando treino PPO com seed {seed} <<<\n")
    ppo_env = DummyVecEnv([make_env(seed)])
    ppo_callback = PPOLoggingCallback()

    model_ppo = PPO(
        policy="MlpPolicy",
        env=ppo_env,
        learning_rate=3e-4,
        n_steps=2048,
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=0.01,
        verbose=1,
        seed=seed,
        tensorboard_log=PPO_LOGS,
        device=device
    )

    model_ppo.learn(
        total_timesteps=TOTAL_TIMESTEPS,
        tb_log_name=f"ppo_seed_{seed}",
        callback=ppo_callback,
        progress_bar=True
    )

    model_ppo.save(os.path.join(PPO_DIR, f"model_ppo_seed_{seed}"))
    ppo_env.close()

print("\n✅ Treino finalizado para DQN e PPO em 3 seeds independentes!")

# Salvar resultados DQN
with open(os.path.join(DQN_DIR, "eval_results.json"), "w") as f:
    json.dump(dqn_eval_results, f, indent=4)

# Salvar resultados PPO
with open(os.path.join(PPO_DIR, "eval_results.json"), "w") as f:
    json.dump(ppo_eval_results, f, indent=4)

print("✅ Resultados de avaliação guardados em JSON")




Python: 3.11.14
PyTorch: 2.8.0
Device: cpu
CUDA: None


>>> Iniciando treino DQN com seed 42 <<<

Using cpu device
Logging to ./project_results\DQN\tensorboard\dqn_seed_42_3
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.982    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 10123    |
|    time_elapsed     | 0        |
|    total_timesteps  | 382      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.964    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4016     |
|    time_elapsed     | 0        |
|    total_timesteps  | 759      |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 1.06     |
|    n_updates        | 64       |
----------------------------------
----------------------------------
| rollout/         

DQN Seed 42: Mean Reward = -93.31 ± 18.81, Mean Length = 728.2 ± 373.3

>>> Iniciando treino PPO com seed 42 <<<

Using cpu device
Logging to ./project_results\PPO\tensorboard\ppo_seed_42_3
-----------------------------
| time/              |      |
|    fps             | 2454 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1604         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0069905594 |
|    clip_fraction        | 0.00557      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.000897     |
|    learning_rate        | 0.0003       |
|    loss                 | 658          



Python: 3.11.14
PyTorch: 2.8.0
Device: cpu
CUDA: None


>>> Iniciando treino DQN com seed 67 <<<

Using cpu device
Logging to ./project_results\DQN\tensorboard\dqn_seed_67_2
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.981    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 10064    |
|    time_elapsed     | 0        |
|    total_timesteps  | 406      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.964    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4416     |
|    time_elapsed     | 0        |
|    total_timesteps  | 756      |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.864    |
|    n_updates        | 63       |
----------------------------------
----------------------------------
| rollout/         

DQN Seed 67: Mean Reward = -134.76 ± 53.27, Mean Length = 758.7 ± 369.5

>>> Iniciando treino PPO com seed 67 <<<

Using cpu device
Logging to ./project_results\PPO\tensorboard\ppo_seed_67_2
-----------------------------
| time/              |      |
|    fps             | 2729 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1792        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007615627 |
|    clip_fraction        | 0.0528      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | -0.000661   |
|    learning_rate        | 0.0003      |
|    loss                 | 632         |
|    n_upda



Python: 3.11.14
PyTorch: 2.8.0
Device: cpu
CUDA: None


>>> Iniciando treino DQN com seed 136 <<<

Using cpu device
Logging to ./project_results\DQN\tensorboard\dqn_seed_136_2
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.981    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 9291     |
|    time_elapsed     | 0        |
|    total_timesteps  | 406      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.963    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4305     |
|    time_elapsed     | 0        |
|    total_timesteps  | 769      |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 2.94     |
|    n_updates        | 67       |
----------------------------------
----------------------------------
| rollout/       

DQN Seed 136: Mean Reward = -126.04 ± 21.11, Mean Length = 838.2 ± 324.3

>>> Iniciando treino PPO com seed 136 <<<

Using cpu device
Logging to ./project_results\PPO\tensorboard\ppo_seed_136_2
-----------------------------
| time/              |      |
|    fps             | 2481 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1725         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0041117156 |
|    clip_fraction        | 0.00933      |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | -0.00772     |
|    learning_rate        | 0.0003       |
|    loss                 | 478      


✅ Treino finalizado para DQN e PPO em 3 seeds independentes!
✅ Resultados de avaliação guardados em JSON
