# Configuracion de ambiente

In [None]:
#Descarga librerías no incluidas en Colab usando pip
!pip install stable_baselines3 #Stable Baselines3 -> Framework de Reinforcement Learning
!pip install sb3-contrib #SB3-Contrib es un repositorio aparte con otros algoritmos
!pip install ale-py #ALE se utiliza para el ambiente de Atari
!pip install "gymnasium[atari,accept-rom-license]" stable-baselines3 autorom renderlab -q #Gymnasium, envs de Atari y ROM
!pip install swig
!pip install "gymnasium[box2d]" # Corrected: Added closing double quote
!AutoROM --accept-license
!pip install renderlab #usado para renderizar gym

import stable_baselines3 #importa Stable Baselines3
from stable_baselines3 import DQN #importa el agente/algoritmo de DQN
from stable_baselines3.common.logger import configure #importa herramientas de logger/debug
from stable_baselines3.common.logger import Logger, CSVOutputFormat, HumanOutputFormat #importa herramientas de logger/debug
from stable_baselines3.common.evaluation import evaluate_policy #importa herramienta de evaluación automática
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from sb3_contrib import QRDQN #importa el agente/algoritmo de QRDQN
import gymnasium as gymnasium #importa la libreria de gymnasium con las simulaciones
import renderlab #importa renderlab para los videos
from gymnasium.spaces import Box # Added for PixelObservationWrapper

import ale_py #importa ale para los ambientes de Atari
from gymnasium.wrappers import TimeLimit #importa timelimit para acortar los episodios
from stable_baselines3.common.env_util import make_atari_env #importa make_atari_env para escala de grises
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack #importa VecFrameStack para apilar frames y acelerar así el entrenamiento
from collections import deque #importa para ajustar los videos con VecFrameStack
import cv2 #importa para ajustar los videos con VecFrameStack

#!Importante:
gymnasium.register_envs(ale_py) #Hay que registrar los entornos de ALE manualmente!!!

#Importa otras librerías básicas
import numpy as np
import matplotlib.pyplot as plt
import random
import math
import pandas as pd
import sys
import os

#Limpia los registros generados
from IPython.display import clear_output
clear_output()
print("Todas las librerías han sido instaladas correctamente.")

Todas las librerías han sido instaladas correctamente.


In [None]:
import subprocess

subprocess.run(["AutoROM"], input="Y\n", text=True)

  return datetime.utcnow().replace(tzinfo=utc)



CompletedProcess(args=['AutoROM'], returncode=0)

In [None]:
def load_weights_into_new_model(model_class, policy, env, load_path, **kwargs):
    """
    Crea un modelo NUEVO con hiperparámetros kwargs,
    carga el modelo VIEJO desde load_path,
    y copia los pesos del viejo al nuevo.
    Retorna el modelo nuevo listo para entrenar.
    """
    new_model = model_class(policy, env, **kwargs)
    old_model = model_class.load(load_path, env=env)
    new_model.set_parameters(old_model.get_parameters())
    return new_model


In [None]:
# =========================
# Callback para registrar recompensa por episodio
# =========================
class RewardLoggerCallback(BaseCallback):
    def __init__(self):
        super().__init__()
        self.episode_rewards = []

    def _on_step(self) -> bool:
        if len(self.locals.get("infos", [])) > 0:
            for info in self.locals["infos"]:
                if "episode" in info:
                    self.episode_rewards.append(info["episode"]["r"])
        return True

In [None]:
# =========================
# Env wrapper: parámetros FIJOS (apto para DQN/DDQN)
# (NO recrea el env en cada reset)
# =========================
class FixedLunarLander(gymnasium.Env):
    metadata = {"render_modes": ["rgb_array", None]}

    def __init__(
        self,
        continuous=False,
        gravity=-10.0,
        enable_wind=False,
        wind_power=0.0,
        turbulence_power=0.0,
        render_mode=None,
        seed=None
    ):
        super().__init__()
        self.continuous = continuous
        self.gravity = float(gravity)
        self.enable_wind = bool(enable_wind)
        self.wind_power = float(wind_power)
        self.turbulence_power = float(turbulence_power)
        self.render_mode = render_mode
        self._seed = seed

        self.env = gymnasium.make(
            "LunarLander-v3",
            continuous=self.continuous,
            gravity=self.gravity,
            enable_wind=self.enable_wind,
            wind_power=self.wind_power,
            turbulence_power=self.turbulence_power,
            render_mode=self.render_mode
        )

        # expone spaces
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space

        if self._seed is not None:
            self.env.reset(seed=self._seed)

    def reset(self, *, seed=None, options=None):
        # OJO: no se recrea el env, solo se resetea
        if seed is not None:
            self._seed = seed
        return self.env.reset(seed=self._seed, options=options)

    def step(self, action):
        return self.env.step(action)

    def render(self):
        return self.env.render()

    def close(self):
        if self.env is not None:
            self.env.close()


In [None]:
# =========================
# Trainer integrado (DQN / "DDQN en SB3")
# =========================
class LunarLanderTrainer:
    def __init__(
        self,
        model_fn,
        continuous=False,
        gravity=-10.0,
        enable_wind=False,
        wind_power=0.0,
        turbulence_power=0.0,
        total_timesteps=500_000,
        log_dir="./logs",
        n_envs=8
    ):
        self.continuous = continuous
        self.gravity = gravity # Store as instance attribute
        self.enable_wind = enable_wind # Store as instance attribute
        self.wind_power = wind_power # Store as instance attribute
        self.turbulence_power = turbulence_power # Store as instance attribute
        self.total_timesteps = total_timesteps
        self.log_dir = log_dir
        os.makedirs(log_dir, exist_ok=True)

        def make_env(i):
            def _init():
                env = gymnasium.make(
                    "LunarLander-v3",
                    continuous=False,
                    gravity=self.gravity,
                    enable_wind=self.enable_wind,
                    wind_power=self.wind_power,
                    turbulence_power=self.turbulence_power,
                    render_mode=None
                )
                # Shorter horizon helps avoid "hover forever" policies
                env = TimeLimit(env, max_episode_steps=600)

                env = Monitor(env)
                env.reset(seed=1000 + i)
                return env
            return _init

        self.env = DummyVecEnv([make_env(i) for i in range(n_envs)])

        self.model = model_fn(self.env)

        new_logger = configure(log_dir, ["stdout", "csv"])
        self.model.set_logger(new_logger)

        self.callback = RewardLoggerCallback()

    def train(self, save_path="dqn_lunarlander_model"):
        self.model.learn(
            total_timesteps=self.total_timesteps,
            log_interval=20,
            callback=self.callback
        )
        self.model.save(save_path)
        print(f"Training complete. Model saved at: {save_path}")

    def plot_rewards(self):
        if not self.callback.episode_rewards:
            print("No reward data to plot.")
            return
        plt.figure(figsize=(10, 5))
        plt.plot(self.callback.episode_rewards, label="Episode Reward")
        plt.xlabel("Episodes")
        plt.ylabel("Reward")
        plt.title("Training Rewards Over Time")
        plt.grid()
        plt.legend()
        plt.tight_layout()
        plt.show()

    def evaluate(self, n_eval_episodes=100):
        import numpy as np
        from stable_baselines3.common.vec_env import DummyVecEnv
        from stable_baselines3.common.monitor import Monitor
        from stable_baselines3.common.evaluation import evaluate_policy

        def make_eval_env():
            env = FixedLunarLander(
                continuous=self.continuous,
                gravity=self.gravity,
                enable_wind=self.enable_wind,
                wind_power=self.wind_power,
                turbulence_power=self.turbulence_power,
                render_mode=None,
                seed=None  # no fijar seed aquí
            )
            return Monitor(env)

        eval_env = DummyVecEnv([make_eval_env])

        rewards = []
        for ep in range(n_eval_episodes):
            # cambia seed cada episodio para que el terreno cambie
            eval_env.env_method("reset", seed=10_000 + ep)
            r, _ = evaluate_policy(self.model, eval_env, n_eval_episodes=1, deterministic=True)
            rewards.append(float(r))

        mean_reward = float(np.mean(rewards))
        std_reward = float(np.std(rewards))

        print(f"Eval mean reward over {n_eval_episodes} episodes: {mean_reward:.2f} +/- {std_reward:.2f}")
        print("✅ SOLVED (>=200)" if mean_reward >= 200 else "❌ Not solved yet")
        return mean_reward, std_reward


    def generate_video(self, model_path="dqn_lunarlander_model", out_dir="./output_video"):
        os.makedirs(out_dir, exist_ok=True)

        model = self.model.__class__.load(model_path)

        # Env para video con render
        # Using trainer's own initialized parameters for consistency
        env = FixedLunarLander(
            continuous=self.continuous,
            gravity=self.gravity,
            enable_wind=self.enable_wind,
            wind_power=self.wind_power,
            turbulence_power=self.turbulence_power,
            render_mode="rgb_array",
            seed=123
        )
        env = renderlab.RenderFrame(env, out_dir)

        obs, info = env.reset()
        terminated, truncated = False, False
        total_reward = 0.0

        while not (terminated or truncated):
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)
            total_reward += reward

        print("Recompensa obtenida en el episodio (video):", total_reward)
        env.play()
        env.close()

# Stage 1 strong

In [None]:
# =========================
# Model factory (DQN / "DDQN en SB3")
# =========================
def create_dqn_model_stage1_strong(env):
    return DQN(
        "MlpPolicy",
        env,
        learning_rate=1e-3,
        buffer_size=200_000,
        learning_starts=10_000,
        batch_size=64,
        gamma=0.99,
        train_freq=4,
        gradient_steps=1,
        target_update_interval=1_000,
        exploration_fraction=0.2,
        exploration_initial_eps=1.0,
        exploration_final_eps=0.05,
        policy_kwargs=dict(net_arch=[256, 256]),
        verbose=1
    )



In [None]:
# from stable_baselines3 import DQN

# def run_stage(
#     stage_name: str,
#     timesteps: int,
#     gravity: float,
#     enable_wind: bool,
#     wind_power: float,
#     turbulence_power: float,
#     model_fn,
#     load_model_path: str | None,
#     save_model_path: str,
#     n_envs: int = 8,
# ):
#     # crea trainer con parámetros del escenario
#     trainer = LunarLanderTrainer(
#         model_fn=model_fn,
#         continuous=False,
#         gravity=gravity,
#         enable_wind=enable_wind,
#         wind_power=wind_power,
#         turbulence_power=turbulence_power,
#         total_timesteps=timesteps,
#         log_dir=f"./logs_{stage_name}",
#         n_envs=n_envs
#     )

#     # si hay modelo previo, lo cargamos y seguimos entrenando en el nuevo env
#     if load_model_path is not None:
#         trainer.model = DQN.load(load_model_path, env=trainer.env)
#         print(f"[{stage_name}] Loaded model: {load_model_path}")

#     # entrena, guarda y evalúa
#     trainer.train(save_path=save_model_path)
#     trainer.evaluate(
#         n_eval_episodes=100
#         # Removed gravity, enable_wind, wind_power, turbulence_power as evaluate method uses trainer's own attributes
#     )
#     return trainer

In [None]:
trainer1 = LunarLanderTrainer(
    model_fn=create_dqn_model_stage1_strong,
    gravity=-10.0,
    enable_wind=False,
    wind_power=0.0,
    turbulence_power=0.0,
    total_timesteps=1_200_000,
    log_dir="./logs_stage1_sanity",
    n_envs=1
)
trainer1.train("lander_stage1_sanity")
trainer1.evaluate(100)







Using cuda device


  return datetime.utcnow().replace(tzinfo=utc)



Logging to ./logs_stage1_sanity
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 86.8     |
|    ep_rew_mean      | -224     |
|    exploration_rate | 0.993    |
| time/               |          |
|    episodes         | 20       |
|    fps              | 5213     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1736     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 91.7     |
|    ep_rew_mean      | -211     |
|    exploration_rate | 0.985    |
| time/               |          |
|    episodes         | 40       |
|    fps              | 5994     |
|    time_elapsed     | 0        |
|    total_timesteps  | 3667     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 90.2     |
|    ep_rew_mean      | -211     |
|    exploration_rate | 0.979    |
| time/               |

(265.48665816, 56.895356254453816)

In [None]:
print(trainer1.env.action_space)
print(trainer1.env.observation_space)


Discrete(4)
Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)


# Etapa 1 Fine Tune

In [None]:
def create_dqn_model_finetune(env):
    return DQN(
        "MlpPolicy",
        env,
        learning_rate=2.5e-4,       # lower than 1e-3
        buffer_size=300_000,
        learning_starts=0,          # fine-tuning: start learning immediately
        batch_size=64,
        gamma=0.99,
        train_freq=4,
        gradient_steps=1,
        target_update_interval=1_000,
        exploration_fraction=0.05,  # very little exploration
        exploration_initial_eps=0.05,
        exploration_final_eps=0.01,
        policy_kwargs=dict(net_arch=[256, 256]),
        verbose=1,
    )


In [None]:
trainer1_ft = LunarLanderTrainer(
    model_fn=create_dqn_model_finetune,
    gravity=-10.0,
    enable_wind=False,
    wind_power=0.0,
    turbulence_power=0.0,
    total_timesteps=400_000,           # +400k polish
    log_dir="./logs_stage1_finetune",
    n_envs=1
)
trainer1_ft.model = DQN.load("lander_stage1_sanity", env=trainer1_ft.env)

trainer1_ft.train("lander_stage1_finetune")
trainer1_ft.evaluate(100)


Using cuda device
Logging to ./logs_stage1_finetune
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 87.4     |
|    ep_rew_mean      | -156     |
|    exploration_rate | 0.979    |
| time/               |          |
|    episodes         | 20       |
|    fps              | 7116     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1748     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 88       |
|    ep_rew_mean      | -163     |
|    exploration_rate | 0.958    |
| time/               |          |
|    episodes         | 40       |
|    fps              | 7295     |
|    time_elapsed     | 0        |
|    total_timesteps  | 3520     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 90.6     |
|    ep_rew_mean      | -166     |
|    exploration_rate | 0.935    |
| t

(255.70815299000006, 47.47777028642905)

# Entrenamiento 2A sin viento

In [None]:
# Variación gravedad (sin viento)
# Stage 2a: gravity -11
trainer2a = LunarLanderTrainer(
    model_fn=create_dqn_model_finetune,
    gravity=-11.0,
    enable_wind=False,
    wind_power=0.0,
    turbulence_power=0.0,
    total_timesteps=250_000,
    log_dir="./logs_stage2a_g-11",
    n_envs=1
)
trainer2a.model = DQN.load("lander_stage1_finetune", env=trainer2a.env)
trainer2a.train("lander_stage2a_g-11")
trainer2a.evaluate(100)




Using cuda device
Logging to ./logs_stage2a_g-11
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 86.4     |
|    ep_rew_mean      | -191     |
|    exploration_rate | 0.967    |
| time/               |          |
|    episodes         | 20       |
|    fps              | 6789     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1728     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 89.9     |
|    ep_rew_mean      | -182     |
|    exploration_rate | 0.932    |
| time/               |          |
|    episodes         | 40       |
|    fps              | 7030     |
|    time_elapsed     | 0        |
|    total_timesteps  | 3596     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 87.9     |
|    ep_rew_mean      | -183     |
|    exploration_rate | 0.9      |
| time

(172.11065156, 111.14902302423864)

# Etapa 2 graved=-9 No viento

In [None]:
# Stage 2b: gravity -9
trainer2b = LunarLanderTrainer(
    model_fn=create_dqn_model_finetune,
    gravity=-9.0,
    enable_wind=False,
    wind_power=0.0,
    turbulence_power=0.0,
    total_timesteps=250_000,
    log_dir="./logs_stage2b_g-9",
    n_envs=1
)
trainer2b.model = DQN.load("lander_stage2a_g-11", env=trainer2b.env)
trainer2b.train("lander_stage2b_g-9")
trainer2b.evaluate(100)

Using cuda device
Logging to ./logs_stage2b_g-9
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 94.1     |
|    ep_rew_mean      | -178     |
|    exploration_rate | 0.964    |
| time/               |          |
|    episodes         | 20       |
|    fps              | 7222     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1882     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 96       |
|    ep_rew_mean      | -183     |
|    exploration_rate | 0.927    |
| time/               |          |
|    episodes         | 40       |
|    fps              | 7242     |
|    time_elapsed     | 0        |
|    total_timesteps  | 3841     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 97.5     |
|    ep_rew_mean      | -193     |
|    exploration_rate | 0.889    |
| time/

(242.99547177000005, 68.96400750894459)

In [None]:
# Optional: gravity -11 (harder)
trainer2c = LunarLanderTrainer(
    model_fn=create_dqn_model_finetune,
    gravity=-11.0,
    enable_wind=False,
    wind_power=0.0,
    turbulence_power=0.0,
    total_timesteps=250_000,
    log_dir="./logs_stage2c_g-12",
    n_envs=1
)
trainer2c.model = DQN.load("lander_stage2b_g-9", env=trainer2c.env)e
trainer2c.train("lander_stage2c_g-12")
trainer2c.evaluate(100)

SyntaxError: invalid syntax (ipython-input-1099001153.py, line 12)

# Etapa 3 viento fine tune

In [None]:
stage3_kwargs = dict(
    learning_rate=2.5e-4,
    buffer_size=300_000,
    learning_starts=10_000,
    batch_size=64,
    gamma=0.99,
    train_freq=4,
    gradient_steps=1,
    target_update_interval=1000,
    exploration_fraction=0.15,
    exploration_initial_eps=0.20,
    exploration_final_eps=0.05,
    policy_kwargs=dict(net_arch=[256, 256]),
    verbose=1
)


In [None]:
trainer3 = LunarLanderTrainer(
    model_fn=lambda env: DQN("MlpPolicy", env, verbose=1),  # placeholder
    gravity=-10.0,
    enable_wind=True,
    wind_power=8.0,
    turbulence_power=0.5,
    total_timesteps=500_000,
    log_dir="./logs_stage3_wind_soft_fix",
    n_envs=1
)

# ✅ AQUÍ defines de dónde vienes (tu mejor modelo SIN viento)
# Recomendado: el que ya resolvió Stage 1 (>=200)
load_path_stage3 = "lander_stage1_finetune"   # <-- AJUSTA si usas otro

trainer3.model = load_weights_into_new_model(
    DQN, "MlpPolicy", trainer3.env,
    load_path=load_path_stage3,
    **stage3_kwargs
)

trainer3.train("lander_stage3_wind_soft_fix")
trainer3.evaluate(100)
trainer3.generate_video(model_path="lander_stage3_wind_soft_fix", out_dir="./output_video_stage3")


## Etapa 4

In [None]:
stage4_kwargs = dict(
    learning_rate=1e-4,
    buffer_size=500_000,
    learning_starts=50_000,
    batch_size=128,
    gamma=0.99,
    train_freq=4,
    gradient_steps=1,
    target_update_interval=500,
    exploration_fraction=0.30,
    exploration_initial_eps=0.30,
    exploration_final_eps=0.05,
    policy_kwargs=dict(net_arch=[256, 256]),
    verbose=1
)


In [None]:
trainer4 = LunarLanderTrainer(
    model_fn=lambda env: DQN("MlpPolicy", env, verbose=1),  # placeholder
    gravity=-10.0,
    enable_wind=True,
    wind_power=18.0,
    turbulence_power=1.5,
    total_timesteps=900_000,
    log_dir="./logs_stage4_wind_strong_v2",
    n_envs=1
)

# ✅ Stage 4 SIEMPRE debe partir del modelo arreglado de Stage 3
trainer4.model = load_weights_into_new_model(
    DQN, "MlpPolicy", trainer4.env,
    load_path="lander_stage3_wind_soft_fix",
    **stage4_kwargs
)

trainer4.train("lander_stage4_wind_strong_v2")
trainer4.evaluate(100)
trainer4.generate_video(model_path="lander_stage4_wind_strong_v2", out_dir="./output_video_stage4")


In [None]:
# trainer3_1 = LunarLanderTrainer(
#     model_fn=create_dqn_model_wind_finetune,
#     gravity=-10.0,
#     enable_wind=True,
#     wind_power=8.0,
#     turbulence_power=0.5,
#     total_timesteps=400_000,           # +400k polish
#     log_dir="./logs_stage3_1_wind_polish",
#     n_envs=1
# )

# trainer3_1.model = DQN.load("lander_stage3_wind_soft", env=trainer3_1.env)

# trainer3_1.train("lander_stage3_1_wind_polish")
# trainer3_1.evaluate(100)


In [None]:
# from stable_baselines3 import DQN

# def create_dqn_model_stage4(env):
#     return DQN(
#         "MlpPolicy",
#         env,
#         learning_rate=2.5e-4,          # volver a un LR que ya te funcionó en viento :contentReference[oaicite:3]{index=3}
#         buffer_size=500_000,
#         learning_starts=50_000,        # <- clave: llena buffer en el nuevo dominio antes de aprender
#         batch_size=128,                # batch más grande suele estabilizar en viento fuerte
#         gamma=0.99,
#         train_freq=4,
#         gradient_steps=1,
#         target_update_interval=500,    # updates más frecuentes ayudan a adaptarse (antes era 1000)
#         exploration_fraction=0.30,     # <- más tiempo explorando
#         exploration_initial_eps=0.30,  # <- más exploración al arrancar Stage 4
#         exploration_final_eps=0.05,    # <- no la mates a 0.01 tan rápido en dominio nuevo
#         policy_kwargs=dict(net_arch=[256, 256]),
#         verbose=1,
#     )


In [None]:
# trainer4 = LunarLanderTrainer(
#     model_fn=lambda env: DQN("MlpPolicy", env, verbose=1),  # placeholder
#     gravity=-10.0,
#     enable_wind=True,
#     wind_power=18.0,
#     turbulence_power=1.5,
#     total_timesteps=900_000,          # dale más; 700k a veces queda corto en este salto
#     log_dir="./logs_stage4_strong_v2",
#     n_envs=1                           # importante: en tus stages duros te fue mejor con n_envs=1
# )

# trainer4.model = load_weights_into_new_model(
#     DQN,
#     "MlpPolicy",
#     trainer4.env,
#     load_path="lander_stage3_fix_real",
#     learning_rate=2.5e-4,
#     buffer_size=500_000,
#     learning_starts=50_000,
#     batch_size=128,
#     gamma=0.99,
#     train_freq=4,
#     gradient_steps=1,
#     target_update_interval=500,
#     exploration_fraction=0.30,
#     exploration_initial_eps=0.30,
#     exploration_final_eps=0.05,
#     policy_kwargs=dict(net_arch=[256, 256]),
#     verbose=1
# )

# trainer4.train("lander_stage4_strong_v2")
# trainer4.evaluate(100)
# trainer4.generate_video(model_path="lander_stage4_strong_v2", out_dir="./output_video_stage4_v2")


In [None]:
scenarios = [
    (-11.9, False, 0.0, 0.0),
    (-9.0,  False, 0.0, 0.0),
    (-10.0, True,  8.0, 0.5),
    (-10.0, True,  18.0, 1.5),
    (-11.0, True,  20.0, 2.0),
]

means = []
for g, w, wp, tp in scenarios:
    tmp = LunarLanderTrainer(
        model_fn=lambda env: DQN("MlpPolicy", env, verbose=0),
        gravity=g,
        enable_wind=w,
        wind_power=wp,
        turbulence_power=tp,
        total_timesteps=1,
        log_dir="./logs_eval_tmp",
        n_envs=1
    )
    tmp.model = DQN.load("lander_stage4_wind_strong_v2", env=tmp.env)
    m, s = tmp.evaluate(50)
    means.append(m)

overall = float(np.mean(means))
print("Overall mean across scenarios:", overall)
print("✅ SOLVED overall (>=200)" if overall >= 200 else "❌ Not solved overall")


In [None]:
# # =========================
# # STAGE 1: Fácil (sin viento, gravedad fija)
# # =========================
# trainer1 = run_stage(
#     stage_name="stage1_easy",
#     timesteps=1_200_000,
#     gravity=-10.0,
#     enable_wind=False,
#     wind_power=0.0,
#     turbulence_power=0.0,
#     model_fn=create_dqn_model_stage1_strong,
#     load_model_path=None,
#     save_model_path="lander_stage1",
#     n_envs=1
# )

In [None]:
# trainer1.generate_video(model_path="lander_stage1")
# trainer1.plot_rewards()

In [None]:
# # =========================
# # STAGE 2: Generaliza en gravedad (sin viento)
# # Re-entrenamos en 2 gravedades distintas para robustez
# # =========================
# trainer2a = run_stage(
#     stage_name="stage2_grav_-11",
#     timesteps=300_000,
#     gravity=-11.0,
#     enable_wind=False,
#     wind_power=0.0,
#     turbulence_power=0.0,
#     load_model_path="lander_stage1",
#     save_model_path="lander_stage2a",
#     n_envs=8
# )

# trainer2b = run_stage(
#     stage_name="stage2_grav_-9",
#     timesteps=300_000,
#     gravity=-9.0,
#     enable_wind=False,
#     wind_power=0.0,
#     turbulence_power=0.0,
#     load_model_path="lander_stage2a",
#     save_model_path="lander_stage2",
#     n_envs=8
# )


In [None]:
# trainer2b.generate_video(model_path="lander_stage2")
# trainer2b.plot_rewards()

In [None]:

# # =========================
# # STAGE 3: Viento suave
# # =========================
# trainer3 = run_stage(
#     stage_name="stage3_wind_soft",
#     timesteps=500_000,
#     gravity=-10.0,
#     enable_wind=True,
#     wind_power=8.0,
#     turbulence_power=0.5,
#     load_model_path="lander_stage2",
#     save_model_path="lander_stage3",
#     n_envs=8
# )

In [None]:
# trainer3.generate_video(model_path="lander_stage3")
# trainer3.plot_rewards()

In [None]:
# continuous = False
# gravity = -10.0
# enable_wind = False
# wind_power = 0.0
# turbulence_power = 0.0
# env_render = gymnasium.make(
#     "LunarLander-v3",
#     continuous=continuous,
#     gravity=gravity,
#     enable_wind=enable_wind,
#     wind_power=wind_power,
#     turbulence_power=turbulence_power,
#     render_mode='rgb_array' # Added render_mode for RenderFrame compatibility
# )
# env_render = renderlab.RenderFrame(env_render, "./output")

# terminated = False
# truncated = False
# total_reward = 0

# obs, info = env_render.reset()

# while not (terminated or truncated):
#   action = 0
#   obs, reward, terminated, truncated, info = env_render.step(action)
#   total_reward += reward

# print("Recompensa", total_reward)

# env_render.play()

In [None]:
# # Callback personalizado para registrar las recompensas por episodio durante el entrenamiento
# class RewardLoggerCallback(BaseCallback):
#     def __init__(self):
#         super().__init__()
#         self.episode_rewards = []  # Lista para almacenar las recompensas de cada episodio

#     def _on_step(self) -> bool:
#         # Verifica si hay información de episodios en los 'infos' proporcionados por el entorno
#         if len(self.locals["infos"]) > 0:
#             for info in self.locals["infos"]:
#                 if "episode" in info:
#                     # Agrega la recompensa del episodio a la lista
#                     self.episode_rewards.append(info["episode"]["r"])
#         return True  # Continúa el entrenamiento

# # Custom wrapper to provide pixel observations for CnnPolicy
# class PixelObservationWrapper(gymnasium.ObservationWrapper):
#     def __init__(self, env):
#         super().__init__(env)
#         # The new observation space will be a single grayscale 84x84x1 image
#         self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8) # Changed shape to 3D

#     def observation(self, obs): # obs here is the original state vector from env.step()
#         # Render the environment to get pixel observations
#         rgb_array = self.env.render()
#         processed_frame = self._preprocess_frame(rgb_array)
#         return processed_frame.reshape(84, 84, 1) # Reshape to (84, 84, 1)

#     def _preprocess_frame(self, frame):
#         # Convert to grayscale and resize
#         gray = frame.mean(axis=2).astype(np.uint8) # Assuming RGB (H, W, 3)
#         resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
#         return resized # Returns (84, 84)

# # Clase encargada de entrenar al agente en el entorno Freeway
# class LunaLanderTrainer:
#     def __init__(
#         self,
#         model_fn,
#         continuous=False, # Corrected: 'continous' to 'continuous'
#         gravity = -10.0,
#         enable_wind = False,
#         wind_power = 0.0,
#         turbulence_power = 0.0,
#         total_timesteps=204_800,
#         log_dir="./logs"
#         ):
#         self.continuous = continuous # Corrected: 'continous' to 'continuous'
#         self.gravity = gravity
#         self.enable_wind = enable_wind
#         self.wind_power = wind_power
#         self.turbulence_power = turbulence_power
#         self.total_timesteps = total_timesteps  # Número total de pasos de entrenamiento
#         self.log_dir = log_dir  # Carpeta donde se almacenan los logs
#         os.makedirs(log_dir, exist_ok=True)  # Crea la carpeta si no existe

#         # Create the base LunarLander environment with rgb_array render_mode
#         base_env = gymnasium.make(
#             "LunarLander-v3",
#             continuous=self.continuous,
#             gravity=self.gravity,
#             enable_wind=self.enable_wind,
#             wind_power=self.wind_power,
#             turbulence_power=self.turbulence_power,
#             render_mode='rgb_array' # Important for pixel observations
#         )

#         # Wrap with PixelObservationWrapper to get 84x84 grayscale observations
#         pixel_env = PixelObservationWrapper(base_env)

#         # Import 'Monitor' from stable_baselines3.common.monitor
#         from stable_baselines3.common.monitor import Monitor
#         monitored_env = Monitor(pixel_env, filename=os.path.join(log_dir, "monitor.csv"))

#         # Create a DummyVecEnv to wrap the single environment
#         # DummyVecEnv expects a list of callable environment constructors
#         vec_env_single = DummyVecEnv([lambda: monitored_env])

#         # Apply VecFrameStack to stack 4 frames for CnnPolicy input
#         self.env = VecFrameStack(vec_env_single, n_stack=4) # Removed channels_first=True

#         self.model = model_fn(self.env)  # Crea el modelo usando la función proporcionada

#         # Configura un logger personalizado que guarda en consola y en CSV
#         new_logger = configure(log_dir, ["stdout", "csv"])
#         self.model.set_logger(new_logger)

#         # Instancia del callback para registrar recompensas
#         self.callback = RewardLoggerCallback()

#     def train(self, save_path="dqn_lunarlander_model"):
#         # Entrena el modelo y guarda el archivo resultante
#         self.model.learn(total_timesteps=self.total_timesteps, log_interval=20, callback=self.callback)
#         self.model.save(save_path)
#         print("Training complete. Model saved.")  # Mensaje al finalizar

#     def plot_rewards(self):
#         # Grafica la evolución de las recompensas por episodio
#         if not self.callback.episode_rewards:
#             print("No reward data to plot.")  # Si no hay datos, notifica
#             return
#         plt.figure(figsize=(10, 5))
#         plt.plot(self.callback.episode_rewards, label="Episode Reward")
#         plt.xlabel("Episodes")
#         plt.ylabel("Reward")
#         plt.title("Training Rewards Over Time")
#         plt.grid()
#         plt.legend()
#         plt.tight_layout()
#         plt.show()

#     def generate_video(self, model_path="dqn_lunarlander_model"):
#         # Carga el modelo entrenado y genera un video del agente actuando
#         model = self.model.__class__.load(model_path)

#         # Crea el entorno con modo de renderizado RGB para capturar imágenes
#         env = gymnasium.make(
#             "LunarLander-v3", # Changed 'ALE/LunarLander-v3' to 'LunarLander-v3' as 'ALE/' prefix is not needed for LunarLander
#             render_mode="rgb_array",
#             continuous=self.continuous,
#             gravity=self.gravity,
#             enable_wind=self.enable_wind,
#             wind_power=self.wind_power,
#             turbulence_power=self.turbulence_power)
#         env = renderlab.RenderFrame(env, "./output")  # Envuelve el entorno para capturar video

#         frame_stack = deque(maxlen=4)  # Pila de frames para mantener la entrada en el mismo formato que durante el entrenamiento

#         # Función para preprocesar las observaciones: convierte a escala de grises y redimensiona
#         def preprocess(frame_rgb):
#             gray = frame_rgb.mean(axis=2).astype(np.uint8)
#             resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
#             return resized.reshape(84, 84, 1) # Ensure 3D output here too

#         obs, info = env.reset() # This `obs` is the state vector, not pixel data.
#         # Get initial pixel observation for the frame stack
#         rgb_initial = env.render() # Render to get the RGB array
#         preprocessed_initial = preprocess(rgb_initial)
#         for _ in range(4):
#             frame_stack.append(preprocessed_initial)  # Initialize stack with (84, 84, 1) frames

#         terminated = False
#         truncated = False
#         total_reward = 0  # Acumula la recompensa total del episodio

#         # Ejecuta el episodio hasta que termine
#         while not (terminated or truncated):
#             stacked_obs = np.stack(frame_stack, axis=0) # This will create (4, 84, 84, 1)
#             # NatureCNN expects (batch_size, channels, height, width) or (channels, height, width) for predict
#             # Transpose from (N, H, W, C) to (N, C, H, W) for NatureCNN
#             stacked_obs_for_predict = stacked_obs.transpose(0, 3, 1, 2).squeeze(axis=1) # Resulting shape (4, 84, 84)
#             action, _ = model.predict(stacked_obs_for_predict, deterministic=True)  # Predice la acción a tomar

#             obs, reward, terminated, truncated, info = env.step(action)  # Execute the action (obs here is the state vector)
#             total_reward += reward  # Acumula la recompensa

#             # Get new pixel observation after step
#             rgb_after_step = env.render()
#             preprocessed_after_step = preprocess(rgb_after_step)  # Preprocesa la nueva observación
#             frame_stack.append(preprocessed_after_step)  # Actualiza la pila de frames

#         print("Recompensa obtenida en el episodio:", total_reward)  # Muestra la recompensa obtenida
#         env.play()  # Reproduce el video generado

In [None]:
# def create_model(env):
#     return DQN(
#         "CnnPolicy",
#         env,
#         learning_rate=0.0001,
#         buffer_size=50000,
#         learning_starts=1024,
#         batch_size=32,
#         gamma=0.99,
#         train_freq=4,
#         target_update_interval=10000,
#         exploration_fraction=0.5,
#         exploration_initial_eps=1.0,
#         exploration_final_eps=0.05,
#         verbose=1,
#         policy_kwargs=dict(net_arch=[256, 256]),
#     )

# trainer = LunaLanderTrainer(
#     model_fn=create_model,
#     continuous=False,
#     gravity = -10.0,
#     enable_wind = False,
#     wind_power = 0.0,
#     turbulence_power = 0.0,
#     total_timesteps=100_000
#     )
# trainer.train()


In [None]:
# trainer.generate_video()
# trainer.plot_rewards()

In [None]:
# # Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# # Define the path in Google Drive to save the model
# drive_save_dir = '/content/drive/MyDrive/DRL/my_models'
# import os
# os.makedirs(drive_save_dir, exist_ok=True)

# # Define the local path of the trained model (default name from trainer.train())
# local_model_path = 'dqn_lunarlander_model.zip'

# # Define the full path in Google Drive
# drive_model_path = os.path.join(drive_save_dir, local_model_path)

# # Copy the model to Google Drive
# import shutil
# if os.path.exists(local_model_path):
#     shutil.copy(local_model_path, drive_model_path)
#     print(f"Model saved to Google Drive at: {drive_model_path}")
# else:
#     print(f"Error: Local model '{local_model_path}' not found. Please ensure the training completed successfully.")