# Monografía Final
## Redes Neuronales - FIUBA
### Alumno: Julian Stejman

> Implementación simple con Stable Baselines3 PPO (sin LSTM).

Flujo simple: juego -> red neuronal -> entrenamiento -> evaluación.

Se usa PPO de Stable Baselines3 con una CNN y stacking de 4 frames.

In [None]:
%pip install "numpy<2.0" stable-baselines3 gym_super_mario_bros nes-py shimmy
import numpy as np
import torch
import torch.nn as nn
from nes_py.wrappers import JoypadSpace
import gym
from gym import spaces
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from gym.wrappers import GrayScaleObservation, ResizeObservation
from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, VecTransposeImage
from stable_baselines3.common.evaluation import evaluate_policy

Note: you may need to restart the kernel to use updated packages.


In [9]:
class ClipObservation(gym.ObservationWrapper):
    """Observation wrapper that forces every frame into the uint8 range [0, 255].

    The advertised observation space keeps the wrapped environment's shape but
    is re-declared as a ``uint8`` box bounded by 0 and 255.
    """

    def __init__(self, env):
        """Wrap *env* and re-declare its observation space as a uint8 box."""
        super().__init__(env)
        wrapped_shape = env.observation_space.shape
        self.observation_space = spaces.Box(
            low=0, high=255, shape=wrapped_shape, dtype=np.uint8
        )

    def observation(self, observation):
        """Return *observation* clipped to [0, 255] and cast to ``np.uint8``."""
        clipped = np.clip(observation, 0, 255)
        return clipped.astype(np.uint8)

def make_env():
    """Build one preprocessed ``SuperMarioBros-v0`` environment.

    Wrappers are applied in this order: restrict the action set to
    ``RIGHT_ONLY``, convert frames to grayscale (keeping the channel axis),
    resize to 84x84, then clip/cast observations with ``ClipObservation``.

    Returns:
        The fully wrapped environment.
    """
    base_env = gym_super_mario_bros.make('SuperMarioBros-v0')
    right_only_env = JoypadSpace(base_env, RIGHT_ONLY)
    gray_env = GrayScaleObservation(right_only_env, keep_dim=True)
    small_env = ResizeObservation(gray_env, shape=(84, 84))
    # Outermost wrapper: clip values and cast to uint8.
    return ClipObservation(small_env)

# Training environment: a single vectorised env, stacked over the last 4
# frames, then passed through VecTransposeImage (innermost wrapper is applied
# first).
env = VecTransposeImage(
    VecFrameStack(DummyVecEnv([make_env]), n_stack=4)
)

OverflowError: Python integer 1024 out of bounds for uint8

In [None]:
class CustomCNN(BaseFeaturesExtractor):
    """Convolutional feature extractor for image observations.

    Three ``Conv2d`` + ``ReLU`` stages are followed by a flatten and a single
    ``Linear`` + ``ReLU`` layer that projects to ``features_dim`` outputs.

    Args:
        observation_space: Observation space of the environment; its first
            shape entry is used as the number of input channels.
        features_dim: Size of the feature vector produced by ``forward``.
    """

    def __init__(self, observation_space, features_dim=256):
        super().__init__(observation_space, features_dim)
        in_channels = observation_space.shape[0]

        conv_stack = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.cnn = nn.Sequential(*conv_stack)

        # Dry run on one sampled observation (with a batch axis added) to
        # discover the flattened size that feeds the linear layer.
        with torch.no_grad():
            dummy_batch = torch.as_tensor(observation_space.sample()[None]).float()
            flat_size = self.cnn(dummy_batch).shape[1]

        self.linear = nn.Sequential(nn.Linear(flat_size, features_dim), nn.ReLU())

    def forward(self, observations):
        """Run *observations* through the conv stack and the linear head."""
        conv_features = self.cnn(observations)
        return self.linear(conv_features)

# Explicit instance of the custom CNN. It is not handed to PPO below; PPO
# receives the class itself through policy_kwargs.
custom_cnn = CustomCNN(env.observation_space)

policy_kwargs = {
    "features_extractor_class": CustomCNN,
    "features_extractor_kwargs": {"features_dim": 256},
}

# Load a previously saved model if one exists; otherwise train and save one.
model_path = "ppo_mario_sb3"
try:
    model = PPO.load(model_path, env=env)
except FileNotFoundError:
    # Only a missing checkpoint triggers training. Any other load failure
    # (corrupt file, incompatible spaces, ...) is raised instead of being
    # hidden behind a silent retrain that would overwrite the saved model.
    model = None

if model is not None:
    print("Modelo cargado desde disco.")
else:
    # Training happens outside the except block so that a failure here is not
    # reported as a chained consequence of the missing-file error.
    model = PPO(
        "CnnPolicy",
        env,
        policy_kwargs=policy_kwargs,
        verbose=1,
        n_steps=2048,
        batch_size=64,
        learning_rate=2.5e-4,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        device="auto",
    )

    # Adjust total_timesteps to the time available.
    model.learn(total_timesteps=200_000)
    model.save(model_path)
    print("Modelo entrenado y guardado.")

In [None]:
# Separate evaluation environment, built with the same wrapper pipeline as the
# training one (single env, 4-frame stack, VecTransposeImage).
eval_env = VecTransposeImage(
    VecFrameStack(DummyVecEnv([make_env]), n_stack=4)
)

# Evaluate over 5 episodes with deterministic action selection.
mean_reward, std_reward = evaluate_policy(
    model, eval_env, n_eval_episodes=5, deterministic=True
)

print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")