# LunarLander (DQN / DDQN via SB3)

Este notebook entrena y evalúa un agente de LunarLander con acciones discretas usando DQN de Stable-Baselines3.
Incluye checkpoints, logging con TensorBoard y verificación de una recompensa promedio >=200 en 10 episodios.

In [None]:
# Instalar dependencias (compatible con Colab)
!pip -q install gymnasium[box2d] stable-baselines3 tensorboard

import os
import json
import time
import subprocess
import numpy as np
import torch
import gymnasium as gym
from gymnasium.wrappers import RecordVideo, TimeLimit
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import CheckpointCallback

# Artifact layout: everything this notebook writes goes under
# <cwd>/artifacts/lunarlander, split into checkpoints, logs and videos.
ROOT_DIR = os.getcwd()
ART_DIR = os.path.join(ROOT_DIR, "artifacts", "lunarlander")
CKPT_DIR, LOG_DIR, VIDEO_DIR = (
    os.path.join(ART_DIR, leaf) for leaf in ("checkpoints", "logs", "videos")
)
for out_dir in (CKPT_DIR, LOG_DIR, VIDEO_DIR):
    os.makedirs(out_dir, exist_ok=True)

# Seed NumPy and PyTorch with the same value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Print library versions, plus the nvidia-smi report when it can be obtained.
print("Torch:", torch.__version__)
print("Gymnasium:", gym.__version__)
try:
    gpu_report = subprocess.check_output(["nvidia-smi"]).decode()
    print(gpu_report)
except Exception:
    # Best effort: any failure while running, decoding or printing the
    # nvidia-smi output is reported as "not available" instead of raising.
    print("nvidia-smi no disponible")

In [None]:
# Fabrica de entorno
def make_env(render_mode=None, seed=None, max_episode_steps=600):
    """Build a monitored, time-limited, discrete-action LunarLander-v3 env.

    The environment is created with gravity -10.0 and with wind and
    turbulence disabled, then wrapped in ``TimeLimit`` and finally in the
    Stable-Baselines3 ``Monitor`` (outermost wrapper).

    Args:
        render_mode: Forwarded to ``gym.make`` (for example ``"rgb_array"``).
        seed: When not ``None``, the environment is reset once with this seed
            before being returned.
        max_episode_steps: Step cap handed to the ``TimeLimit`` wrapper.
            Defaults to 600, the value that used to be hard-coded here.

    Returns:
        The wrapped environment.
    """
    env = gym.make(
        "LunarLander-v3",
        continuous=False,
        gravity=-10.0,
        enable_wind=False,
        wind_power=0.0,
        turbulence_power=0.0,
        render_mode=render_mode,
    )
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    env = Monitor(env)
    if seed is not None:
        # Reset once with the seed; the observation returned by this reset
        # is discarded.
        env.reset(seed=seed)
    return env

# Single-environment vectorized wrappers: one for training (seeded with SEED)
# and a separately seeded one (SEED + 1000) for evaluation.
train_env = DummyVecEnv(env_fns=[lambda: make_env(seed=SEED)])
eval_env = DummyVecEnv(env_fns=[lambda: make_env(seed=SEED + 1000)])

In [None]:
# Hiperparametros (ajusta si no llega a >=200)
# Hyperparameters for the run, kept in one dict so they can be dumped to disk
# next to the checkpoints.
config = dict(
    learning_rate=1e-3,
    buffer_size=200_000,
    learning_starts=10_000,
    batch_size=64,
    gamma=0.99,
    train_freq=4,
    gradient_steps=1,
    target_update_interval=1000,
    exploration_fraction=0.2,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    policy_net_arch=[256, 256],
    total_timesteps=1_200_000,
    seed=SEED,
)

# Persist the configuration as indented JSON inside the checkpoint directory.
config_path = os.path.join(CKPT_DIR, "config.json")
with open(config_path, "w") as config_file:
    config_file.write(json.dumps(config, indent=2))

# Entries of `config` that are forwarded unchanged as DQN keyword arguments.
_DQN_CONFIG_KEYS = (
    "learning_rate",
    "buffer_size",
    "learning_starts",
    "batch_size",
    "gamma",
    "train_freq",
    "gradient_steps",
    "target_update_interval",
    "exploration_fraction",
    "exploration_initial_eps",
    "exploration_final_eps",
)

# DQN agent with an MLP policy on the training env; TensorBoard logs go to
# LOG_DIR and the network layer sizes come from config["policy_net_arch"].
model = DQN(
    "MlpPolicy",
    train_env,
    policy_kwargs={"net_arch": config["policy_net_arch"]},
    tensorboard_log=LOG_DIR,
    verbose=1,
    seed=SEED,
    **{key: config[key] for key in _DQN_CONFIG_KEYS},
)

# Periodic checkpoints are written to CKPT_DIR with the given name prefix.
checkpoint_callback = CheckpointCallback(
    name_prefix="dqn_lunarlander",
    save_path=CKPT_DIR,
    save_freq=100_000,
)
final_model_path = os.path.join(CKPT_DIR, "dqn_lunarlander_final")

# Train for the configured number of timesteps and measure wall-clock time
# around the learn() call only (saving the final model is not timed).
start_time = time.time()
model.learn(
    total_timesteps=config["total_timesteps"],
    callback=checkpoint_callback,
    log_interval=10,
)
train_time = time.time() - start_time

model.save(final_model_path)
print(f"Tiempo de entrenamiento (s): {train_time:.1f}")

In [None]:
# Evaluacion (10 episodios)
def eval_model(model, n_episodes=10):
    """Evaluate ``model`` one episode at a time on the module-level ``eval_env``.

    Before each episode the environments inside ``eval_env`` are reset with
    the seed ``SEED + 2000 + ep``; ``evaluate_policy`` then runs a single
    deterministic episode. The mean and standard deviation of the collected
    rewards are printed, followed by a solved / not-solved verdict against a
    mean-reward threshold of 200.

    Args:
        model: Policy to evaluate, passed straight to ``evaluate_policy``.
        n_episodes: Number of single-episode evaluations to run.

    Returns:
        List with one float reward per evaluated episode.
    """

    def _episode_reward(ep):
        # Seeded reset first, then one deterministic evaluation episode.
        eval_env.env_method("reset", seed=SEED + 2000 + ep)
        mean_reward, _ = evaluate_policy(
            model, eval_env, n_eval_episodes=1, deterministic=True
        )
        return float(mean_reward)

    rewards = [_episode_reward(ep) for ep in range(n_episodes)]
    mean_r = float(np.mean(rewards))
    std_r = float(np.std(rewards))
    print(f"Recompensa media en {n_episodes} episodios: {mean_r:.2f} +/- {std_r:.2f}")
    if mean_r >= 200:
        print("RESUELTO (>=200)")
    else:
        print("NO RESUELTO")
    return rewards

# Run the 10-episode evaluation and keep the per-episode rewards.
eval_rewards = eval_model(model=model, n_episodes=10)

In [None]:
# Exportar un video corto de evaluacion
# Record one deterministic evaluation episode into VIDEO_DIR.
video_env = make_env(render_mode="rgb_array", seed=SEED + 9999)
video_env = RecordVideo(video_env, video_folder=VIDEO_DIR, name_prefix="lunarlander_eval")
try:
    obs, info = video_env.reset()
    terminated, truncated = False, False
    # Roll out a single episode until it terminates or is truncated.
    while not (terminated or truncated):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = video_env.step(action)
finally:
    # Always close the wrapped environment, even if the rollout raises, so
    # the environment and the recorder are not left open.
    video_env.close()
print("Video guardado en:", VIDEO_DIR)

## TensorBoard

Ejecuta en Colab:

```
%load_ext tensorboard
%tensorboard --logdir artifacts/lunarlander/logs
```

## Reporte técnico (completar después del entrenamiento)
- Algoritmo: DQN (SB3)
- Hiperparámetros: ver config.json
- Librerías y versiones: impresas en la celda de setup
- Hardware: salida de nvidia-smi en la celda de setup
- Tiempo de entrenamiento: impreso al finalizar
- Resultados: media/desviación en 10 episodios
- Conclusiones: agrega observaciones y siguientes pasos