In [None]:
# ─── INSTALACIONES ─────────────────────────────────────────────────────────────
!pip install vizdoom==1.2.4 pyvirtualdisplay imageio imageio-ffmpeg opencv-python-headless stable-baselines3 --quiet


In [None]:
# ─── IMPORTS ───────────────────────────────────────────────────────────────────
import os
import random
import numpy as np
import matplotlib.pyplot as plt
import imageio
import cv2

from pyvirtualdisplay import Display
from collections import deque
from IPython.display import Video, display

import torch as th
import torch.nn as nn

import vizdoom as vzd

import gym
from gym.wrappers import FrameStack

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecTransposeImage, VecFrameStack
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

# ─── GPU AND DISPLAY SETUP ─────────────────────────────────────────────────────
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = th.device("cuda" if th.cuda.is_available() else "cpu")
# Start an off-screen 640x480 virtual display (visible=0).
# NOTE(review): presumably required so ViZDoom can run on a headless host — confirm.
Display(visible=0, size=(640, 480)).start()


In [None]:
# ─── DOWNLOAD THE CUSTOM SCENARIO ──────────────────────────────────────────────
# Fetch the scenario .cfg and .wad into the working directory (-O sets the output
# file name, -q suppresses wget's output).
!wget -q https://raw.githubusercontent.com/lkiel/rl-doom/develop/scenarios/bots_deathmatch_multimaps.cfg -O bots_deathmatch_multimaps.cfg
!wget -q https://raw.githubusercontent.com/lkiel/rl-doom/develop/scenarios/bots_deathmatch_multimaps.wad -O bots_deathmatch_multimaps.wad


In [None]:
# ─── CUSTOM CONFIGURATION FILE ─────────────────────────────────────────────────
# Write the ViZDoom config that create_game() loads. It declares the screen and
# episode settings, 6 buttons and 8 game variables. The order of the buttons
# matters: the action vectors in DoomShapedEnv are indexed in this same order
# (ATTACK, MOVE_FORWARD, TURN_LEFT, TURN_RIGHT, MOVE_LEFT, MOVE_RIGHT).
with open("reward_shaped.cfg", "w") as f:
    f.write("""
episode_timeout = 4096
screen_resolution = RES_160X120
screen_format = RGB24
render_hud = false
render_weapon = false
render_crosshair = false
window_visible = false

available_buttons =
    {
        ATTACK
        MOVE_FORWARD
        TURN_LEFT
        TURN_RIGHT
        MOVE_LEFT
        MOVE_RIGHT
    }

available_game_variables =
    {
        KILLCOUNT
        DAMAGECOUNT
        SELECTED_WEAPON
        SELECTED_WEAPON_AMMO
        HEALTH
        ARMOR
        POSITION_X
        POSITION_Y
    }

mode = PLAYER
""")


In [None]:
# ─── GAME INITIALISATION HELPER ────────────────────────────────────────────────
def create_game():
    """Create, configure and start a ViZDoom game, then populate it with bots.

    Loads ``reward_shaped.cfg``, points the game at the
    ``bots_deathmatch_multimaps.wad`` scenario (map ``"M"``), passes the
    deathmatch host arguments, forces PLAYER mode and initialises the game.
    After initialisation any existing bots are removed and four are added.

    Returns:
        The initialised ``vzd.DoomGame`` instance.
    """
    doom = vzd.DoomGame()

    # Scenario and map selection.
    doom.load_config("reward_shaped.cfg")
    doom.set_doom_scenario_path("bots_deathmatch_multimaps.wad")
    doom.set_doom_map("M")

    # Engine arguments and control mode must be set before init().
    doom.add_game_args("-host 1 -deathmatch +sv_spawnfarthest 1 +viz_nocheat 0 +sv_forcerespawn 1")
    doom.set_mode(vzd.Mode.PLAYER)
    doom.init()

    # Clear any bots first, then add four.
    bot_commands = ["removebots"] + ["addbot"] * 4
    for command in bot_commands:
        doom.send_game_command(command)

    return doom

In [None]:
class DoomShapedEnv(gym.Env):
    """
    Custom Gym environment for ViZDoom with reward shaping.

    Observation: uint8 RGB frame, channel first, shape (3, 120, 160).
    Action: index of one of 14 discrete button combinations (see ``self.actions``).

    The environment follows the classic Gym API: ``reset()`` returns only the
    observation and ``step()`` returns ``(obs, reward, done, info)``.

    On top of the reward returned by ``make_action``, every step adds bonuses for
    damage dealt, ammo picked up or used, health and armor gained, movement,
    kills and weapon switches, and penalties for health lost or standing still.
    """

    # Number of bots (re)added to the match on every reset.
    NUM_BOTS = 4
    # Number of AMMO{i} / WEAPON{i} game variables polled (i = 0..9).
    NUM_SLOTS = 10

    def __init__(self, frame_skip=1):
        """Start the game and define the observation/action spaces.

        Args:
            frame_skip: number of tics passed to ``make_action`` for each action.
        """
        super().__init__()
        self.frame_skip = frame_skip
        self.game = create_game()
        self.game.new_episode()

        # Observation: (C, H, W)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(3, 120, 160), dtype=np.uint8
        )

        # 14 hand-picked button combinations. Column order follows the button
        # list of the config: ATTACK, MOVE_FORWARD, TURN_LEFT, TURN_RIGHT,
        # MOVE_LEFT, MOVE_RIGHT.
        self.actions = [
            [1, 0, 0, 0, 0, 0],  # attack
            [0, 1, 0, 0, 0, 0],  # move forward
            [0, 0, 1, 0, 0, 0],  # turn left
            [0, 0, 0, 1, 0, 0],  # turn right
            [0, 0, 0, 0, 1, 0],  # strafe left
            [0, 0, 0, 0, 0, 1],  # strafe right
            [0, 1, 1, 0, 0, 0],  # move forward + turn left
            [0, 1, 0, 1, 0, 0],  # move forward + turn right
            [1, 0, 1, 0, 0, 0],  # attack + turn left
            [1, 0, 0, 1, 0, 0],  # attack + turn right
            [1, 1, 0, 0, 0, 0],  # attack + move forward
            [1, 0, 0, 0, 1, 0],  # attack + strafe left
            [1, 0, 0, 0, 0, 1],  # attack + strafe right
            [0, 0, 0, 0, 0, 0],  # no-op
        ]
        self.action_space = gym.spaces.Discrete(len(self.actions))

        # Baselines used to compute per-step deltas.
        self._reset_trackers()

    # ── internal helpers ─────────────────────────────────────────────────────
    def _read_slots(self, prefix):
        """Read game variables ``{prefix}0`` .. ``{prefix}9`` into an int32 array."""
        values = np.zeros(self.NUM_SLOTS, dtype=np.int32)
        for i in range(self.NUM_SLOTS):
            values[i] = self.game.get_game_variable(getattr(vzd.GameVariable, f"{prefix}{i}"))
        return values

    def _get_ammo_state(self):
        """Return the current AMMO0..AMMO9 values as an int32 array."""
        return self._read_slots("AMMO")

    def _get_weapon_state(self):
        """Return the current WEAPON0..WEAPON9 values as an int32 array."""
        return self._read_slots("WEAPON")

    def _reset_trackers(self):
        """Re-baseline the bookkeeping that ``step`` uses to compute reward deltas."""
        self.last_health = 100
        self.last_armor = 0
        self.last_pos = (
            self.game.get_game_variable(vzd.GameVariable.POSITION_X),
            self.game.get_game_variable(vzd.GameVariable.POSITION_Y)
        )
        self.last_damage = 0
        self.ammo_state = self._get_ammo_state()
        self.weapon_state = self._get_weapon_state()
        self.last_kills = self.game.get_game_variable(vzd.GameVariable.KILLCOUNT)
        self.last_weapon_id = self.game.get_game_variable(vzd.GameVariable.SELECTED_WEAPON)

    def _get_observation(self):
        """Return the current frame as (C, H, W), or zeros when no state is available."""
        state = self.game.get_state()
        if state is None:
            return np.zeros(self.observation_space.shape, dtype=np.uint8)
        return np.transpose(state.screen_buffer, (2, 0, 1))

    # ── Gym API ──────────────────────────────────────────────────────────────
    def step(self, action_idx):
        """Apply one action and return ``(obs, shaped_reward, done, info)``.

        Args:
            action_idx: index into ``self.actions``.

        Returns:
            obs: (3, 120, 160) uint8 frame (zeros when the game has no state).
            shaped_reward: game reward plus the shaping terms described below.
            done: whether the episode finished after this action.
            info: always an empty dict.
        """
        # Called on an already finished episode: start a new one and report done.
        if self.game.is_episode_finished():
            obs = self.reset()
            return obs, 0.0, True, {}

        reward = self.game.make_action(self.actions[action_idx], self.frame_skip)
        done = self.game.is_episode_finished()
        obs = self._get_observation()

        # Current values
        health = self.game.get_game_variable(vzd.GameVariable.HEALTH)
        armor = self.game.get_game_variable(vzd.GameVariable.ARMOR)
        damage = self.game.get_game_variable(vzd.GameVariable.DAMAGECOUNT)
        killcount = self.game.get_game_variable(vzd.GameVariable.KILLCOUNT)
        weapon_id = self.game.get_game_variable(vzd.GameVariable.SELECTED_WEAPON)
        x = self.game.get_game_variable(vzd.GameVariable.POSITION_X)
        y = self.game.get_game_variable(vzd.GameVariable.POSITION_Y)
        new_ammo = self._get_ammo_state()
        # Refreshed every step so the ammo mask below reflects weapons gained or
        # lost during the episode instead of the snapshot taken at reset.
        new_weapons = self._get_weapon_state()
        is_alive = health > 0

        shaped_reward = reward

        # Kills count whether or not the player is alive on this step.
        extra_kill = 1.0 * max(0, killcount - self.last_kills)

        if is_alive:
            weapon_unchanged = weapon_id == self.last_weapon_id

            # Weapon switch bonus
            extra_weapon_change = 0.0 if weapon_unchanged else 0.05

            # Damage dealt
            extra_damage = 0.01 * (damage - self.last_damage)

            # Ammo (only counted while the selected weapon did not change);
            # slots are weighted by the matching WEAPON{i} value.
            if weapon_unchanged:
                ammo_diff = (new_ammo - self.ammo_state) * new_weapons
                pickup = np.clip(ammo_diff, 0, None).sum()
                usage = -np.clip(ammo_diff, None, 0).sum()
                extra_ammo = 0.02 * pickup + 0.01 * usage
            else:
                extra_ammo = 0.0

            # Health: ignored on the first living step after health was <= 0,
            # so the jump back to full health is not rewarded.
            if self.last_health > 0:
                health_delta = health - self.last_health
                health_gain = max(0, health_delta)
                health_loss = -min(0, health_delta)
            else:
                health_gain = 0
                health_loss = 0
            extra_health = 0.02 * health_gain - 0.01 * health_loss

            # Armor gained
            extra_armor = 0.01 * max(0, armor - self.last_armor)

            # Movement: small bonus when displaced by more than 3 units since the
            # previous step, penalty otherwise.
            dist = np.sqrt((x - self.last_pos[0])**2 + (y - self.last_pos[1])**2)
            extra_move = 0.0005 if dist > 3 else -0.0025

            shaped_reward += (
                extra_damage + extra_ammo + extra_health + extra_armor +
                extra_move + extra_kill + extra_weapon_change
            )
        else:
            # Dead: only kills registered on this step are counted.
            shaped_reward += extra_kill

        # Update baselines for the next step
        self.last_health = health
        self.last_armor = armor
        self.last_pos = (x, y)
        self.last_damage = damage
        self.last_kills = killcount
        self.ammo_state = new_ammo
        self.weapon_state = new_weapons
        self.last_weapon_id = weapon_id

        return obs, shaped_reward, done, {}

    def reset(self):
        """Start a new episode, re-add the bots and return the first observation."""
        self.game.new_episode()
        self.game.send_game_command("removebots")
        for _ in range(self.NUM_BOTS):
            self.game.send_game_command("addbot")
        self._reset_trackers()
        return self._get_observation()

    def render(self, mode='rgb_array'):
        """Return the current frame as (H, W, C); zeros when no state is available."""
        state = self.game.get_state()
        if state is None:
            return np.zeros((120, 160, 3), dtype=np.uint8)
        return state.screen_buffer

    def close(self):
        """Shut down the underlying ViZDoom instance."""
        game = getattr(self, "game", None)  # tolerate a partially constructed env
        if game is not None:
            game.close()

In [None]:
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomCNN(BaseFeaturesExtractor):
    """
    Custom convolutional feature extractor for the Doom screen.

    Architecture: 3 convolutional layers (ReLU) followed by a final linear layer
    (ReLU) that outputs ``features_dim`` features.

    Args:
        observation_space: channel-first image space, shape (C, H, W).
        features_dim: size of the feature vector returned by ``forward``.
    """
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 512):
        super().__init__(observation_space, features_dim)

        # Channel count comes from the (possibly frame-stacked) observation space,
        # e.g. 12 when 4 RGB frames are stacked.
        n_channels = observation_space.shape[0]

        self.cnn = nn.Sequential(
            nn.Conv2d(n_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
            nn.Flatten()
        )

        # Run a dummy batch through the conv stack to find the flattened size.
        with th.no_grad():
            sample_input = th.zeros(1, *observation_space.shape)
            n_flatten = self.cnn(sample_input).shape[1]

        self.linear = nn.Sequential(
            nn.Linear(n_flatten, features_dim),
            nn.ReLU()
        )

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """Map a batch of observations to ``features_dim`` features.

        No division by 255 here: Stable-Baselines3 already scales uint8 image
        observations to [0, 1] before calling the features extractor (policy
        default ``normalize_images=True``), so dividing again would shrink the
        inputs to [0, 1/255].
        """
        return self.linear(self.cnn(observations))


In [None]:
from stable_baselines3.common.vec_env import SubprocVecEnv, VecFrameStack

# Number of parallel environments
num_envs = 8

# FrameStack = 4 (stack 4 consecutive frames)
frame_stack_n = 4

# Factory for a single-environment constructor (one is handed to each subprocess)
def make_env():
    """Return a zero-argument callable that builds a fresh ``DoomShapedEnv``.

    The environment itself is only created when the returned callable is
    invoked, not when ``make_env`` is called.
    """
    def _build_env():
        # frame_skip=1: every action is applied for a single tic
        return DoomShapedEnv(frame_skip=1)

    return _build_env

# Create the parallel environments (num_envs constructors passed to SubprocVecEnv)
env_train = SubprocVecEnv([make_env() for _ in range(num_envs)])

# Stack frame_stack_n consecutive frames along the channel axis (to convey motion)
env_train = VecFrameStack(env_train, n_stack=frame_stack_n, channels_order='first')


In [None]:
from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback

# Global lists filled by LoggingCallback during training
episode_rewards = []   # one value appended per rollout (in _on_rollout_end)
episode_rewards2 = []  # total reward of every finished episode (in _on_step)
episode_losses = []    # "train/entropy_loss" values read from the logger (in _on_step)

class LoggingCallback(BaseCallback):
    """Track episode returns during training and save the best-scoring model.

    Side effects on module-level lists:
      * ``episode_rewards2`` receives the total reward of every finished episode.
      * ``episode_rewards`` receives, once per rollout, the mean return of the
        last ``window_size`` finished episodes.
      * ``episode_losses`` receives the logger's ``train/entropy_loss`` value
        whenever one is pending.

    Args:
        verbose: verbosity level forwarded to ``BaseCallback``.
        save_path: directory (created if missing) where best models are saved.
        reward_threshold: minimum episode return required before saving a model.
        window_size: number of recent episodes averaged for ``episode_rewards``.
    """
    def __init__(self, verbose=0, save_path="best_models_inline", reward_threshold=50.0, window_size=100):
        super().__init__(verbose)
        self.current_rewards = None
        self.best_reward = -np.inf
        self.save_path = save_path
        self.reward_threshold = reward_threshold
        # Returns of the most recently finished episodes, across all envs.
        self.recent_returns = deque(maxlen=window_size)
        os.makedirs(self.save_path, exist_ok=True)

    def _on_training_start(self) -> None:
        # One running return per parallel environment.
        self.current_rewards = [0.0] * self.training_env.num_envs

    def _on_step(self) -> bool:
        rewards = self.locals["rewards"]
        dones = self.locals["dones"]

        for i, (reward, done) in enumerate(zip(rewards, dones)):
            self.current_rewards[i] += reward
            if not done:
                continue

            total = float(self.current_rewards[i])
            self.current_rewards[i] = 0.0
            episode_rewards2.append(total)
            self.recent_returns.append(total)
            print(f"Episodio completado: reward = {total:.2f}")

            # Save the model when the return reaches the threshold and beats the best so far.
            if total >= self.reward_threshold and total > self.best_reward:
                self.best_reward = total
                # Only the reward's decimal point is replaced; dots that belong to
                # save_path (e.g. "./models") must stay intact so the file lands in
                # the directory created in __init__.
                reward_tag = f"{total:.1f}".replace(".", "_")
                filename = f"{self.save_path}/best_model_ep{self.num_timesteps}_rew{reward_tag}"
                self.model.save(filename)
                print(f"💾 Guardado nuevo mejor modelo con reward = {total:.2f} en {filename}.zip")

        # Record the entropy loss whenever the logger currently holds one.
        # NOTE(review): the logger keeps a value until its next flush, so the same
        # number is likely appended on many consecutive steps — confirm that is
        # what the entropy plot should show.
        maybe_loss = self.model.logger.name_to_value.get("train/entropy_loss")
        if maybe_loss is not None:
            episode_losses.append(maybe_loss)

        return True

    def _on_rollout_end(self) -> None:
        # The mean is computed from the callback's own window: SB3 records
        # "rollout/ep_rew_mean" and flushes the logger only after this hook has
        # run (and only when the env is wrapped in a Monitor), so looking the key
        # up in the logger here never finds it.
        if not self.recent_returns:
            return
        rew = float(np.mean(self.recent_returns))
        episode_rewards.append(rew)
        print(f"Recompensa en rollout completado: {rew:.2f}")


# ─── CHECKPOINT CALLBACK EVERY 500K TIMESTEPS ──────────────────────────────────
# CheckpointCallback counts callback calls, and each call advances every parallel
# environment by one step. Dividing by the number of envs keeps the interval at
# ~500K timesteps instead of 500K * num_envs.
checkpoint_callback = CheckpointCallback(
    save_freq=max(500_000 // env_train.num_envs, 1),
    save_path="./checkpoints",
    name_prefix="ppo_doom_phase1"
)

# ─── PPO MODEL WITH THE CUSTOM CNN ─────────────────────────────────────────────
policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=512)
)

model = PPO(
    policy="CnnPolicy",
    env=env_train,
    learning_rate=1e-4,
    n_steps=1024,
    batch_size=1024,
    n_epochs=4,
    ent_coef=0.02,  # higher entropy coefficient to encourage exploration
    policy_kwargs=policy_kwargs,
    verbose=1,
    device=device
)

logging_callback = LoggingCallback(save_path="best_models_inline", reward_threshold=50.0)

model.learn(
    total_timesteps=9_000_000,
    callback=[checkpoint_callback, logging_callback]
)


# ─── SAVE THE TRAINED MODEL ────────────────────────────────────────────────────
model.save("ppo_doom_bots_phase1")


In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: per-rollout reward and actor entropy.
fig_summary, (ax_reward, ax_entropy) = plt.subplots(1, 2, figsize=(12, 4))

# ─── REWARD PER ROLLOUT ────────────────────────────────────────────────────────
ax_reward.plot(episode_rewards)
ax_reward.set_title("Recompensa por rollout (no promedio móvil)")
ax_reward.set_xlabel("Rollout (cada n_steps pasos)")
ax_reward.set_ylabel("Recompensa")
ax_reward.grid(True)

# ─── ENTROPY (LOSS PROXY) ──────────────────────────────────────────────────────
ax_entropy.plot(episode_losses)
ax_entropy.set_title("Entropía del actor (proxy de pérdida)")
ax_entropy.set_xlabel("Paso de entrenamiento")
ax_entropy.set_ylabel("Entropía")
ax_entropy.grid(True)

fig_summary.tight_layout()
plt.show()


# ─── TOTAL REWARD OF EVERY FINISHED EPISODE ────────────────────────────────────
fig_episodes, ax_episodes = plt.subplots(figsize=(10, 4))
ax_episodes.plot(episode_rewards2)
ax_episodes.set_title("Recompensa por episodio")
ax_episodes.set_xlabel("Episodio")
ax_episodes.set_ylabel("Reward total")
ax_episodes.grid(True)
plt.show()


In [None]:
def evaluate_and_record(model, filename="doom_phase1_eval.mp4", num_episodes=5, fps=15, n_stack=4):
    """Run the model for a few episodes, record a video and display it inline.

    Args:
        model: trained model exposing ``predict(obs, deterministic=True)``.
        filename: path of the video file to write.
        num_episodes: number of evaluation episodes to play.
        fps: frame rate of the recorded video.
        n_stack: number of stacked frames; must match the value used in training.

    The evaluation environment and the video writer are always closed, even if
    an episode raises.
    """
    from IPython.display import Video, display

    env_eval = DummyVecEnv([make_env()])
    try:
        env_eval = VecFrameStack(env_eval, n_stack=n_stack, channels_order='first')
        env_eval.venv.render_mode = "rgb_array"

        rewards = []
        writer = imageio.get_writer(filename, fps=fps)
        try:
            for ep in range(num_episodes):
                obs = env_eval.reset()
                done = [False]
                total_reward = 0.0

                while not done[0]:
                    action, _ = model.predict(obs, deterministic=True)
                    obs, reward, done, _ = env_eval.step(action)
                    total_reward += reward[0]

                    # Capture the current frame.
                    # NOTE(review): vectorised envs reset automatically when an
                    # episode ends, so the frame captured on the terminal step
                    # presumably belongs to the next episode — confirm if it matters.
                    frame = env_eval.render(mode='rgb_array')
                    if frame is not None:
                        writer.append_data(frame)

                print(f"Episodio {ep+1}: reward = {total_reward:.2f}")
                rewards.append(total_reward)
        finally:
            # Finalise the video file before it is displayed (or on error).
            writer.close()
    finally:
        env_eval.close()

    print("Recompensas por episodio:", rewards)
    display(Video(filename, embed=True))


In [None]:
# Play 5 evaluation episodes with the trained model and show the recorded video.
evaluate_and_record(model, filename="doom_phase1_eval.mp4", num_episodes=5)
