In [1]:
!pip install gym-super-mario-bros==7.4.0 nes-py pyvirtualdisplay
!apt-get update && apt-get install -y xvfb ffmpeg


Collecting gym-super-mario-bros==7.4.0
  Downloading gym_super_mario_bros-7.4.0-py3-none-any.whl.metadata (10 kB)
Collecting nes-py
  Downloading nes_py-8.2.1.tar.gz (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.7/77.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl.metadata (943 bytes)
Collecting pyglet<=1.5.21,>=1.4.0 (from nes-py)
  Downloading pyglet-1.5.21-py3-none-any.whl.metadata (7.6 kB)
Downloading gym_super_mario_bros-7.4.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Downloading pyglet-1.5.21-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m


In [2]:
import random
import numpy as np
import cv2
import gym
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from collections import deque
from pyvirtualdisplay import Display

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Pick the compute device: the CUDA GPU when PyTorch can see one, else the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Usando dispositivo:", device)

# Start a hidden 1400x900 virtual display. Per the original author's note it is
# required for rendering when saving video.
display = Display(size=(1400, 900), visible=0)
display.start()


Usando dispositivo: cuda


<pyvirtualdisplay.display.Display at 0x7dd1c54093d0>

In [3]:
# Build the game environment and restrict its controls to SIMPLE_MOVEMENT.
# The id can be "SuperMarioBros-v0" or a specific level such as
# "SuperMarioBros-1-1-v0".
env = JoypadSpace(gym_super_mario_bros.make("SuperMarioBros-v0"), SIMPLE_MOVEMENT)
n_actions = env.action_space.n  # size of the discrete action space
print("Número de acciones:", n_actions)


  logger.warn(


Número de acciones: 7


  deprecation(
  deprecation(


In [4]:
def preprocess_frame(frame):
    """Turn an RGB frame into an 84x84 grayscale image scaled by 1/255.

    Args:
        frame: RGB image array (presumably uint8 as produced by the
            environment -- TODO confirm).

    Returns:
        float32 array of shape (84, 84).
    """
    shrunk_gray = cv2.resize(
        cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY),
        (84, 84),
        interpolation=cv2.INTER_AREA,
    )
    return shrunk_gray.astype(np.float32) / 255.0

# Example: build the first state by repeating the initial frame four times
# along a new leading axis.
obs = env.reset()
state = preprocess_frame(obs)
state_stack = np.repeat(state[np.newaxis, ...], 4, axis=0)  # shape = (4, 84, 84)


In [5]:
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        input_channels: number of channels of the input (stacked frames).
        num_actions: number of Q-values produced per input.

    The fully connected input size is derived for 84x84 inputs.
    """

    def __init__(self, input_channels, num_actions):
        super().__init__()
        # Convolutional feature extractor
        self.conv1 = nn.Conv2d(input_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        # Fully connected head; its input width is measured from the conv stack
        self.fc_input_dim = self._get_conv_output((input_channels, 84, 84))
        self.fc1 = nn.Linear(self.fc_input_dim, 512)
        self.fc2 = nn.Linear(512, num_actions)

    def _get_conv_output(self, shape):
        """Return the number of elements the conv stack emits for one input of `shape`."""
        probe = torch.zeros(1, *shape)
        for conv in (self.conv1, self.conv2, self.conv3):
            probe = conv(probe)
        return probe.numel()

    def forward(self, x):
        """Map a batch of inputs to a (batch, num_actions) tensor of Q-values."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))
        flat = x.view(x.size(0), -1)  # flatten everything but the batch axis
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

# Build the online network (policy_net) first and the target network
# (target_net) second, both on the selected device.
policy_net, target_net = (
    DQN(input_channels=4, num_actions=n_actions).to(device) for _ in range(2)
)
# Start the target network from the same weights as the online one.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()  # target network stays in evaluation mode


DQN(
  (conv1): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
  (conv2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=3136, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=7, bias=True)
)

In [6]:
class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions.

    Args:
        capacity: maximum number of transitions kept; once full, the oldest
            transition is dropped on every push.
        compress_frames: when True, states are stored as uint8
            (round(value * 255)) and converted back to float32 / 255 by
            `sample`. This expects state values in [0, 1]; values that are
            exact multiples of 1/255 in float32 round-trip unchanged, anything
            else is quantized. Defaults to False, which stores the arrays as
            given.
    """

    def __init__(self, capacity, compress_frames=False):
        self.capacity = capacity
        self.compress_frames = compress_frames
        self.buffer = deque(maxlen=capacity)
        # Compressed mode only: the next_state stored by the previous push,
        # kept so an identical following state can reuse the same array.
        self._last_next_state = None

    def _encode(self, frames):
        """Convert `frames` to the representation kept in the buffer."""
        # np.asarray never forces a copy and, unlike np.array(..., copy=False)
        # on NumPy >= 2.0, does not raise when one is unavoidable (e.g. lists).
        frames = np.asarray(frames)
        if not self.compress_frames:
            return frames
        return np.rint(np.clip(frames, 0.0, 1.0) * 255.0).astype(np.uint8)

    def push(self, state, action, reward, next_state, done):
        """Append one transition.

        In the default mode ndarray inputs are stored by reference (no copy),
        so callers must not mutate them in place afterwards.
        """
        state = self._encode(state)
        next_state = self._encode(next_state)
        if self.compress_frames:
            # When this state equals the previously stored next_state (the
            # usual case for consecutive steps), keep a single shared array.
            # Stored arrays are never mutated, so sharing is safe.
            previous = self._last_next_state
            if previous is not None and np.array_equal(previous, state):
                state = previous
            self._last_next_state = next_state
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions uniformly at random.

        Returns:
            (states, actions, rewards, next_states, dones) where states and
            next_states are stacked arrays and the rest are tuples.

        Raises:
            ValueError: if `batch_size` exceeds the number of stored
                transitions (raised by `random.sample`).
        """
        transitions = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*transitions)
        states = np.array(states)
        next_states = np.array(next_states)
        if self.compress_frames:
            states = states.astype(np.float32) / 255.0
            next_states = next_states.astype(np.float32) / 255.0
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

# Initialise the replay memory. A (4, 84, 84) float32 stack takes 112,896
# bytes, so 200,000 stored stacks would need about 22 GB; uint8 storage cuts
# that to a quarter.
memory = ReplayBuffer(capacity=200000, compress_frames=True)


In [7]:
# Hyperparameters
num_episodes = 800            # number of training episodes
max_steps_per_episode = 5000   # step cap per episode
batch_size = 32
gamma = 0.99                   # discount factor
learning_rate = 1e-4
epsilon_start = 1.0
epsilon_end = 0.1
epsilon_decay = 200000         # epsilon drops by 1/epsilon_decay per environment step
target_update = 10             # episodes between target_net synchronisations

optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

steps_done = 0
epsilon = epsilon_start

for episode in range(num_episodes):
    obs = env.reset()
    state = preprocess_frame(obs)
    state_stack = np.stack([state]*4, axis=0)  # start from 4 copies of the first frame
    total_reward = 0

    for t in range(max_steps_per_episode):
        # Epsilon-greedy action selection
        if random.random() < epsilon:
            action = random.randrange(n_actions)
        else:
            with torch.no_grad():
                state_tensor = torch.tensor(state_stack, dtype=torch.float32, device=device).unsqueeze(0)
                q_values = policy_net(state_tensor)
                action = q_values.argmax().item()

        # Step the environment
        next_obs, reward, done, info = env.step(action)
        next_frame = preprocess_frame(next_obs)
        # Slide the window: drop the oldest frame, append the newest
        next_state_stack = np.concatenate([state_stack[1:], next_frame[np.newaxis, ...]], axis=0)

        # Store the transition in replay memory
        memory.push(state_stack, action, reward, next_state_stack, done)
        total_reward += reward
        state_stack = next_state_stack

        # Linear epsilon schedule, floored at epsilon_end. With the values
        # above the floor is reached after 180,000 steps.
        steps_done += 1
        epsilon = max(epsilon_end, epsilon_start - steps_done / epsilon_decay)

        # Optimise once the memory holds at least one batch
        if len(memory) >= batch_size:
            states, actions, rewards, next_states, dones = memory.sample(batch_size)

            # Convert the batch to tensors
            states_v = torch.tensor(states, dtype=torch.float32, device=device)
            next_states_v = torch.tensor(next_states, dtype=torch.float32, device=device)
            actions_v = torch.tensor(actions, dtype=torch.int64, device=device).unsqueeze(1)
            rewards_v = torch.tensor(rewards, dtype=torch.float32, device=device)
            dones_v = torch.tensor(dones, dtype=torch.float32, device=device)

            # Q(s, a) from the online network
            q_values = policy_net(states_v).gather(1, actions_v).squeeze(1)

            # The bootstrap target is a constant with respect to the loss, so
            # compute it without autograd instead of detaching it afterwards.
            with torch.no_grad():
                # Double DQN: the online network picks the next action...
                next_actions = policy_net(next_states_v).argmax(dim=1, keepdim=True)
                # ...and the target network evaluates it.
                next_q_values = target_net(next_states_v).gather(1, next_actions).squeeze(1)
                # (1 - done) removes the bootstrap term on terminal transitions
                expected_q_values = rewards_v + gamma * next_q_values * (1 - dones_v)

            # Huber (smooth L1) loss for stability
            loss = F.smooth_l1_loss(q_values, expected_q_values)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if done:
            break

    # Copy the online weights into the target network every `target_update` episodes
    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f"Episodio {episode+1}/{num_episodes} - Recompensa: {total_reward:.1f} - ε={epsilon:.3f}")


  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):


Episodio 1/800 - Recompensa: 544.0 - ε=0.975
Episodio 2/800 - Recompensa: 303.0 - ε=0.950
Episodio 3/800 - Recompensa: 304.0 - ε=0.925
Episodio 4/800 - Recompensa: 799.0 - ε=0.900
Episodio 5/800 - Recompensa: 304.0 - ε=0.875
Episodio 6/800 - Recompensa: 303.0 - ε=0.850
Episodio 7/800 - Recompensa: 432.0 - ε=0.825
Episodio 8/800 - Recompensa: 304.0 - ε=0.800
Episodio 9/800 - Recompensa: 432.0 - ε=0.775
Episodio 10/800 - Recompensa: 839.0 - ε=0.750
Episodio 11/800 - Recompensa: 429.0 - ε=0.725
Episodio 12/800 - Recompensa: 1056.0 - ε=0.700
Episodio 13/800 - Recompensa: 304.0 - ε=0.675
Episodio 14/800 - Recompensa: 553.0 - ε=0.650
Episodio 15/800 - Recompensa: 963.0 - ε=0.625
Episodio 16/800 - Recompensa: 972.0 - ε=0.600
Episodio 17/800 - Recompensa: 304.0 - ε=0.575
Episodio 18/800 - Recompensa: 979.0 - ε=0.560
Episodio 19/800 - Recompensa: 432.0 - ε=0.535
Episodio 20/800 - Recompensa: 808.0 - ε=0.510
Episodio 21/800 - Recompensa: 1602.0 - ε=0.485
Episodio 22/800 - Recompensa: 1941.0 - ε=

In [8]:
# Create a fresh environment for playback
video_env = gym_super_mario_bros.make("SuperMarioBros-v0")
video_env = JoypadSpace(video_env, SIMPLE_MOVEMENT)

frames = []
try:
    obs = video_env.reset()
    state = preprocess_frame(obs)
    state_stack = np.stack([state]*4, axis=0)

    done = False
    while not done:
        with torch.no_grad():
            state_tensor = torch.tensor(state_stack, dtype=torch.float32, device=device).unsqueeze(0)
            action = policy_net(state_tensor).argmax().item()  # always the greedy action
        next_obs, reward, done, info = video_env.step(action)
        frame = video_env.render(mode='rgb_array')
        # NOTE(review): render() may hand back the emulator's live screen
        # buffer (confirm against nes-py). Copy so that every stored frame is
        # an independent snapshot.
        frames.append(np.array(frame, copy=True))

        next_frame = preprocess_frame(next_obs)
        state_stack = np.concatenate([state_stack[1:], next_frame[np.newaxis, ...]], axis=0)
finally:
    # Release the emulator even if the rollout fails midway
    video_env.close()

# Save the video with OpenCV
height, width = frames[0].shape[:2]
video_name = 'mario_final.mp4'
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video = cv2.VideoWriter(video_name, fourcc, 30, (width, height))
# A writer that failed to open ignores every write(); fail loudly instead of
# reporting a video that was never produced.
if not video.isOpened():
    raise RuntimeError(f"No se pudo abrir el archivo de video: {video_name}")

try:
    for f in frames:
        # OpenCV expects BGR channel order
        video.write(cv2.cvtColor(f, cv2.COLOR_RGB2BGR))
finally:
    video.release()
print("Video guardado como", video_name)


  logger.warn(
  deprecation(
  deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  logger.warn(


Video guardado como mario_final.mp4


In [9]:
import imageio
from IPython.display import HTML
from base64 import b64encode

# Create a clean environment used ONLY for recording the video
video_env = gym_super_mario_bros.make("SuperMarioBros-v0")
video_env = JoypadSpace(video_env, SIMPLE_MOVEMENT)

frames = []
try:
    obs = video_env.reset()
    state = preprocess_frame(obs)
    state_stack = np.stack([state]*4, axis=0)

    done = False
    while not done:
        with torch.no_grad():
            state_tensor = torch.tensor(state_stack, dtype=torch.float32, device=device).unsqueeze(0)
            action = policy_net(state_tensor).argmax().item()

        next_obs, reward, done, info = video_env.step(action)

        # Capture straight from the environment's screen buffer when it is
        # exposed; otherwise fall back to the RGB observation returned by
        # step() so that frames are never silently dropped.
        screen = getattr(video_env.unwrapped, "screen", None)
        frame_source = screen if screen is not None else next_obs
        frames.append(np.array(frame_source, copy=True))

        next_frame = preprocess_frame(next_obs)
        state_stack = np.concatenate([state_stack[1:], next_frame[np.newaxis, ...]], axis=0)
finally:
    # Release the emulator even if the rollout fails midway
    video_env.close()

# Save the video. This writes to the same file name as the OpenCV cell above
# and therefore replaces that recording.
video_name = "mario_final.mp4"
imageio.mimsave(video_name, frames, fps=30)
print("✅ Video guardado como:", video_name)

# Show the video inline
def display_video(filename):
    """Return an HTML <video> element embedding `filename` as a base64 data URL.

    Args:
        filename: path of the MP4 file to embed; it is read fully into memory.

    Returns:
        IPython HTML object with a 640x480 player (controls, autoplay, loop).
    """
    with open(filename, "rb") as video_file:
        encoded = b64encode(video_file.read()).decode()
    data_url = "data:video/mp4;base64," + encoded
    markup = f"""
        <video width="640" height="480" controls autoplay loop>
            <source src="{data_url}" type="video/mp4">
        </video>
    """
    return HTML(markup)

# Last expression of the cell: the returned HTML player is rendered as the cell output.
display_video(video_name)


✅ Video guardado como: mario_final.mp4
