# Ver jugar a un modelo de Super Mario preentrenado

https://github.com/jorgecasase/mariobrosplay-windows-gym-RL

Este notebook permite cargar y observar cómo juega un modelo de Super Mario preentrenado. Utiliza un entorno personalizado basado en OpenAI Gym, junto con varias optimizaciones específicas para juegos retro.

## Requisitos
1. Haber creado el entorno virutal con conda

2. Un modelo preentrenado debe estar disponible en el archivo `trained_mario.chkpt`.
   - Si no tienes un modelo preentrenado, puedes entrenarlo siguiendo el tutorial en el siguiente enlace:
     [Mario RL con PyTorch - Entrenamiento en 400 episodios](https://github.com/pedroconcejero/deep_learning_2024/blob/main/mario_RL_pytorch_tutorial_400_episodes_save_every_1e4.ipynb)
     - o puedes usar el de github
2. Asegúrate de tener instaladas todas las dependencias necesarias, incluidas `gym`, `gym_super_mario_bros`, y `torch`.

Una vez cumplidos los requisitos, puedes cargar el modelo y observar su rendimiento en el entorno.

Nota: este notebook está adaptado para ser ejecutado en windows nativo, no funciona en colab ni en macos.

## clases de juego Neural, MetricLogger, Mario, wrapper

In [1]:
from torch import nn
import copy
class MarioNet(nn.Module):
    '''mini cnn structure
    input -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> output
    '''
    def __init__(self, input_dim, output_dim):
        super().__init__()
        c, h, w = input_dim

        if h != 84:
            raise ValueError(f"Expecting input height: 84, got: {h}")
        if w != 84:
            raise ValueError(f"Expecting input width: 84, got: {w}")

        self.online = nn.Sequential(
            nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, output_dim)
        )

        self.target = copy.deepcopy(self.online)

        # Q_target parameters are frozen.
        for p in self.target.parameters():
            p.requires_grad = False

    def forward(self, input, model):
        if model == 'online':
            return self.online(input)
        elif model == 'target':
            return self.target(input)


import numpy as np
import time, datetime
import matplotlib.pyplot as plt
import torch
import random
from pathlib import Path
import gym
import datetime
from skimage import transform
from collections import deque
from gym.spaces import Box

class MetricLogger():
    def __init__(self, save_dir):
        self.save_log = save_dir / "log"
        with open(self.save_log, "w") as f:
            f.write(
                f"{'Episode':>8}{'Step':>8}{'Epsilon':>10}{'MeanReward':>15}"
                f"{'MeanLength':>15}{'MeanLoss':>15}{'MeanQValue':>15}"
                f"{'TimeDelta':>15}{'Time':>20}\n"
            )
        self.ep_rewards_plot = save_dir / "reward_plot.jpg"
        self.ep_lengths_plot = save_dir / "length_plot.jpg"
        self.ep_avg_losses_plot = save_dir / "loss_plot.jpg"
        self.ep_avg_qs_plot = save_dir / "q_plot.jpg"

        # History metrics
        self.ep_rewards = []
        self.ep_lengths = []
        self.ep_avg_losses = []
        self.ep_avg_qs = []

        # Moving averages, added for every call to record()
        self.moving_avg_ep_rewards = []
        self.moving_avg_ep_lengths = []
        self.moving_avg_ep_avg_losses = []
        self.moving_avg_ep_avg_qs = []

        # Current episode metric
        self.init_episode()

        # Timing
        self.record_time = time.time()


    def log_step(self, reward, loss, q):
        self.curr_ep_reward += reward
        self.curr_ep_length += 1
        if loss:
            self.curr_ep_loss += loss
            self.curr_ep_q += q
            self.curr_ep_loss_length += 1

    def log_episode(self):
        "Mark end of episode"
        self.ep_rewards.append(self.curr_ep_reward)
        self.ep_lengths.append(self.curr_ep_length)
        if self.curr_ep_loss_length == 0:
            ep_avg_loss = 0
            ep_avg_q = 0
        else:
            ep_avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5)
            ep_avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5)
        self.ep_avg_losses.append(ep_avg_loss)
        self.ep_avg_qs.append(ep_avg_q)

        self.init_episode()

    def init_episode(self):
        self.curr_ep_reward = 0.0
        self.curr_ep_length = 0
        self.curr_ep_loss = 0.0
        self.curr_ep_q = 0.0
        self.curr_ep_loss_length = 0

    def record(self, episode, epsilon, step):
        mean_ep_reward = np.round(np.mean(self.ep_rewards[-100:]), 3)
        mean_ep_length = np.round(np.mean(self.ep_lengths[-100:]), 3)
        mean_ep_loss = np.round(np.mean(self.ep_avg_losses[-100:]), 3)
        mean_ep_q = np.round(np.mean(self.ep_avg_qs[-100:]), 3)
        self.moving_avg_ep_rewards.append(mean_ep_reward)
        self.moving_avg_ep_lengths.append(mean_ep_length)
        self.moving_avg_ep_avg_losses.append(mean_ep_loss)
        self.moving_avg_ep_avg_qs.append(mean_ep_q)


        last_record_time = self.record_time
        self.record_time = time.time()
        time_since_last_record = np.round(self.record_time - last_record_time, 3)

        print(
            f"Episode {episode} - "
            f"Step {step} - "
            f"Epsilon {epsilon} - "
            f"Mean Reward {mean_ep_reward} - "
            f"Mean Length {mean_ep_length} - "
            f"Mean Loss {mean_ep_loss} - "
            f"Mean Q Value {mean_ep_q} - "
            f"Time Delta {time_since_last_record} - "
            f"Time {datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}"
        )

        with open(self.save_log, "a") as f:
            f.write(
                f"{episode:8d}{step:8d}{epsilon:10.3f}"
                f"{mean_ep_reward:15.3f}{mean_ep_length:15.3f}{mean_ep_loss:15.3f}{mean_ep_q:15.3f}"
                f"{time_since_last_record:15.3f}"
                f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'):>20}\n"
            )

        for metric in ["ep_rewards", "ep_lengths", "ep_avg_losses", "ep_avg_qs"]:
            plt.plot(getattr(self, f"moving_avg_{metric}"))
            plt.savefig(getattr(self, f"{metric}_plot"))
            plt.clf()
class Mario:
    def __init__(self, state_dim, action_dim, save_dir, checkpoint=None):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.memory = deque(maxlen=100000)
        self.batch_size = 32

        self.exploration_rate = 1
        self.exploration_rate_decay = 0.99999975
        self.exploration_rate_min = 0.1
        self.gamma = 0.9

        self.curr_step = 0
        self.burnin = 1e5  # min. experiences before training
        self.learn_every = 3   # no. of experiences between updates to Q_online
        self.sync_every = 1e4   # no. of experiences between Q_target & Q_online sync

        self.save_every = 5e5   # no. of experiences between saving Mario Net
        self.save_dir = save_dir

        self.use_cuda = torch.cuda.is_available()

        # Mario's DNN to predict the most optimal action - we implement this in the Learn section
        self.net = MarioNet(self.state_dim, self.action_dim).float()
        if self.use_cuda:
            self.net = self.net.to(device='cuda')
        if checkpoint:
            self.load(checkpoint)

        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025)
        self.loss_fn = torch.nn.SmoothL1Loss()


    def act(self, state):
        """
        Given a state, choose an epsilon-greedy action and update value of step.

        Inputs:
        state(LazyFrame): A single observation of the current state, dimension is (state_dim)
        Outputs:
        action_idx (int): An integer representing which action Mario will perform
        """
        # EXPLORE
        if np.random.rand() < self.exploration_rate:
            action_idx = np.random.randint(self.action_dim)

        # EXPLOIT
        else:
            state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
            state = state.unsqueeze(0)
            action_values = self.net(state, model='online')
            action_idx = torch.argmax(action_values, axis=1).item()

        # decrease exploration_rate
        self.exploration_rate *= self.exploration_rate_decay
        self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)

        # increment step
        self.curr_step += 1
        return action_idx

    def cache(self, state, next_state, action, reward, done):
        """
        Store the experience to self.memory (replay buffer)

        Inputs:
        state (LazyFrame),
        next_state (LazyFrame),
        action (int),
        reward (float),
        done(bool))
        """
        state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
        next_state = torch.FloatTensor(next_state).cuda() if self.use_cuda else torch.FloatTensor(next_state)
        action = torch.LongTensor([action]).cuda() if self.use_cuda else torch.LongTensor([action])
        reward = torch.DoubleTensor([reward]).cuda() if self.use_cuda else torch.DoubleTensor([reward])
        done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])

        self.memory.append( (state, next_state, action, reward, done,) )


    def recall(self):
        """
        Retrieve a batch of experiences from memory
        """
        batch = random.sample(self.memory, self.batch_size)
        state, next_state, action, reward, done = map(torch.stack, zip(*batch))
        return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()


    def td_estimate(self, state, action):
        current_Q = self.net(state, model='online')[np.arange(0, self.batch_size), action] # Q_online(s,a)
        return current_Q


    @torch.no_grad()
    def td_target(self, reward, next_state, done):
        next_state_Q = self.net(next_state, model='online')
        best_action = torch.argmax(next_state_Q, axis=1)
        next_Q = self.net(next_state, model='target')[np.arange(0, self.batch_size), best_action]
        return (reward + (1 - done.float()) * self.gamma * next_Q).float()


    def update_Q_online(self, td_estimate, td_target) :
        loss = self.loss_fn(td_estimate, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()


    def sync_Q_target(self):
        self.net.target.load_state_dict(self.net.online.state_dict())


    def learn(self):
        if self.curr_step % self.sync_every == 0:
            self.sync_Q_target()

        if self.curr_step % self.save_every == 0:
            self.save()

        if self.curr_step < self.burnin:
            return None, None

        if self.curr_step % self.learn_every != 0:
            return None, None

        # Sample from memory
        state, next_state, action, reward, done = self.recall()

        # Get TD Estimate
        td_est = self.td_estimate(state, action)

        # Get TD Target
        td_tgt = self.td_target(reward, next_state, done)

        # Backpropagate loss through Q_online
        loss = self.update_Q_online(td_est, td_tgt)

        return (td_est.mean().item(), loss)


    def save(self):
        save_path = self.save_dir / f"mario_net_{int(self.curr_step // self.save_every)}.chkpt"
        torch.save(
            dict(
                model=self.net.state_dict(),
                exploration_rate=self.exploration_rate
            ),
            save_path
        )
        print(f"MarioNet saved to {save_path} at step {self.curr_step}")


    def load(self, load_path):
        if not load_path.exists():
            raise ValueError(f"{load_path} does not exist")

        ckp = torch.load(load_path, map_location=('cuda' if self.use_cuda else 'cpu'))
        exploration_rate = ckp.get('exploration_rate')
        state_dict = ckp.get('model')

        print(f"Loading model at {load_path} with exploration rate {exploration_rate}")
        self.net.load_state_dict(state_dict)
        self.exploration_rate = exploration_rate

class ResizeObservation(gym.ObservationWrapper):
    def __init__(self, env, shape):
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)

        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        resize_obs = transform.resize(observation, self.shape)
        # cast float back to uint8
        resize_obs *= 255
        resize_obs = resize_obs.astype(np.uint8)
        return resize_obs


class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        """Return only every `skip`-th frame"""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Repeat action, and sum reward"""
        total_reward = 0.0
        done = False
        for i in range(self._skip):
            # Accumulate reward and repeat the same action
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info


## Importación de modulos

In [2]:
# Importación de módulos necesarios
import random, datetime
from pathlib import Path

# Importación de librerías para Gym y Super Mario
import gym
import gym_super_mario_bros
from gym.wrappers import FrameStack, GrayScaleObservation, TransformObservation
from nes_py.wrappers import JoypadSpace

## Creación del entorno

In [3]:
# Creación del entorno base de Super Mario
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')

# Configuración del espacio de acciones (Joypad)
env = JoypadSpace(
    env,
    [['right'],  # Acción 1: caminar a la derecha
    ['right', 'A']]  # Acción 2: caminar a la derecha y saltar
)

## Aplicación de wrappers

In [4]:
# Aplicación de wrappers para modificar el entorno
env = SkipFrame(env, skip=4)  # Saltar frames para mejorar el rendimiento
env = GrayScaleObservation(env, keep_dim=False)  # Convertir observaciones a escala de grises
env = ResizeObservation(env, shape=84)  # Redimensionar observaciones a 84x84 píxeles
env = TransformObservation(env, f=lambda x: x / 255.)  # Normalizar observaciones
env = FrameStack(env, num_stack=4)  # Apilar 4 frames consecutivos

# Reiniciar el entorno para comenzar
env.reset()

<gym.wrappers.frame_stack.LazyFrames at 0x195fe67bac0>

## Cargar modelo preentrenado

In [5]:
# Configuración del directorio de guardado para puntos de control y métricas
save_dir = Path('checkpoints') / datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
save_dir.mkdir(parents=True)  # Crear el directorio si no existe

# Inicialización del agente Mario con un punto de control preentrenado **AQUI CARGAS TU MODELO**
checkpoint = Path('trained_mario.chkpt') #<-- AQUI IMPORTA TU MODELO
mario = Mario(state_dim=(4, 84, 84), action_dim=env.action_space.n, save_dir=save_dir, checkpoint=checkpoint)

# Fijar la tasa de exploración al mínimo
mario.exploration_rate = mario.exploration_rate_min

# Inicialización del registro de métricas
logger = MetricLogger(save_dir)

  ckp = torch.load(load_path, map_location=('cuda' if self.use_cuda else 'cpu'))


Loading model at trained_mario.chkpt with exploration rate 0.1


## Verlo jugar

In [6]:
# Número de episodios de entrenamiento
episodes = 100

# Bucle de entrenamiento
for e in range(episodes):

    # Reiniciar el entorno y obtener el estado inicial
    state = env.reset()

    while True:
        # Renderizar el entorno
        env.render()

        # Elegir una acción con el agente
        action = mario.act(state)

        # Ejecutar la acción en el entorno
        next_state, reward, done, info = env.step(action)

        # Almacenar la transición en la memoria del agente
        mario.cache(state, next_state, action, reward, done)

        # Registrar métricas del paso
        logger.log_step(reward, None, None)

        # Actualizar el estado actual
        state = next_state

        # Salir del bucle si el juego termina o Mario alcanza la meta
        if done or info['flag_get']:
            break

    # Registrar métricas del episodio
    logger.log_episode()

    # Guardar métricas cada 20 episodios
    if e % 20 == 0:
        logger.record(
            episode=e,
            epsilon=mario.exploration_rate,
            step=mario.curr_step
        )

  state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
  return (self.ram[0x86] - self.ram[0x071c]) % 256
  done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])


Episode 0 - Step 156 - Epsilon 0.1 - Mean Reward 636.0 - Mean Length 156.0 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 4.069 - Time 2024-12-12T02:19:12


  return (self.ram[0x86] - self.ram[0x071c]) % 256
  done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])


Episode 20 - Step 5167 - Epsilon 0.1 - Mean Reward 1184.048 - Mean Length 246.048 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 101.049 - Time 2024-12-12T02:20:53


  return (self.ram[0x86] - self.ram[0x071c]) % 256
  done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])


Episode 40 - Step 9586 - Epsilon 0.1 - Mean Reward 1177.146 - Mean Length 233.805 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 89.345 - Time 2024-12-12T02:22:22


  return (self.ram[0x86] - self.ram[0x071c]) % 256
  done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])


Episode 60 - Step 13338 - Epsilon 0.1 - Mean Reward 1140.246 - Mean Length 218.656 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 75.712 - Time 2024-12-12T02:23:38


  return (self.ram[0x86] - self.ram[0x071c]) % 256
  done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])


Episode 80 - Step 17142 - Epsilon 0.1 - Mean Reward 1101.889 - Mean Length 211.63 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 76.388 - Time 2024-12-12T02:24:54


  return (self.ram[0x86] - self.ram[0x071c]) % 256
  done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])


<Figure size 640x480 with 0 Axes>