In [None]:
import pygame
import numpy as np
import random

# Configuration
CELL_SIZE = 20  # side length of one maze cell, in pixels
MAZE_SIZE = 35  # maze side length, in cells, used to size the window
WIDTH = MAZE_SIZE * CELL_SIZE  # window width in pixels
HEIGHT = MAZE_SIZE * CELL_SIZE  # window height in pixels
FPS = 60  # frame-rate cap passed to the pygame clock

# Colours (RGB tuples)
COLOR_WALL = (20, 80, 20)  # wall cells
COLOR_PATH = (230, 220, 200)  # background / walkable cells
COLOR_AGENT = (255, 50, 50)  # the agent
COLOR_GOAL = (255, 215, 0)  # goal once every checkpoint has been visited
COLOR_GOAL_LOCKED = (100, 100, 50)  # goal while some checkpoint is still pending
COLOR_CHECKPOINT = (0, 100, 255)  # checkpoint not yet visited
COLOR_CHECKED = (150, 150, 150)  # checkpoint already visited

class HedgeMazeEnv:
    """Square grid maze with mandatory checkpoints, for tabular RL.

    The maze is a ``size x size`` integer array where ``1`` is a wall and
    ``0`` is a walkable cell. The agent starts at ``(1, 1)`` and must step on
    every checkpoint before the goal at ``(size - 2, size - 2)`` ends the
    episode.

    Observations are ``(row, col, checkpoint_mask)`` tuples, where bit ``i`` of
    ``checkpoint_mask`` is set once checkpoint ``i`` has been visited.
    ``n_states`` gives the matching Q-table shape (without the action axis).

    Rewards: ``-0.01`` per move, ``-0.05`` for bumping into a wall, ``5.0``
    for the first visit to a checkpoint and ``50.0`` for reaching the goal
    with all checkpoints visited.

    Note: the generator carves corridors on odd coordinates starting from
    ``(1, 1)``, so the goal cell lies on the carved grid only when ``size``
    is odd.
    """

    # Action index -> (row delta, column delta): up, down, left, right.
    _MOVES = [(-1, 0), (1, 0), (0, -1), (0, 1)]

    def __init__(self, size=35):
        """Build a random maze of side ``size`` and reset the episode.

        Raises:
            ValueError: if ``size`` is smaller than 3 (no interior cell).
        """
        if size < 3:
            raise ValueError("size must be at least 3")

        self.size = size
        self.screen = None
        self.clock = None

        self.maze = np.ones((self.size, self.size), dtype=int)
        self._generate_maze(1, 1)
        self._add_multiple_paths(40)

        self.goal_pos = [self.size - 2, self.size - 2]
        self.maze[self.goal_pos[0], self.goal_pos[1]] = 0

        s = self.size
        potential_cp = [(1, s - 2), (s - 2, 1), (s // 2, s // 2), (s // 4, s - s // 4)]
        # Each candidate is snapped to an open cell so that no checkpoint can
        # end up inside a wall (which would make the goal impossible to unlock).
        self.checkpoints = [self._nearest_open_cell(r, c) for r, c in potential_cp]

        # One bit per checkpoint in the observation's third component.
        self.n_states = (self.size, self.size, 2 ** len(self.checkpoints))
        self.n_actions = len(self._MOVES)

        self.reset()

    def _nearest_open_cell(self, r, c):
        """Return ``[row, col]`` of an open cell at or closest to ``(r, c)``.

        The search grows in square rings around the (grid-clamped) start and
        stops at the first open cell found, scanning each ring row by row.
        If the maze has no open cell at all, the clamped start is returned.
        """
        last = self.size - 1
        r = min(max(r, 0), last)
        c = min(max(c, 0), last)
        if self.maze[r, c] == 0:
            return [r, c]

        for radius in range(1, self.size):
            for dr in range(-radius, radius + 1):
                for dc in range(-radius, radius + 1):
                    if max(abs(dr), abs(dc)) != radius:
                        continue  # interior of the ring was checked earlier
                    nr, nc = r + dr, c + dc
                    if 0 <= nr <= last and 0 <= nc <= last and self.maze[nr, nc] == 0:
                        return [nr, nc]
        return [r, c]

    def _generate_maze(self, x, y):
        """Carve a maze by randomized depth-first search starting at ``(x, y)``.

        Uses an explicit stack instead of recursion so large mazes cannot hit
        Python's recursion limit. Directions are shuffled once per cell, on
        first visit, exactly as a recursive formulation would do.
        """

        def shuffled_directions():
            directions = [(0, 2), (0, -2), (2, 0), (-2, 0)]
            random.shuffle(directions)
            return iter(directions)

        self.maze[x, y] = 0
        stack = [(x, y, shuffled_directions())]
        while stack:
            cx, cy, pending = stack[-1]
            # `pending` is an iterator, so this loop resumes where it stopped
            # the last time this cell was on top of the stack.
            for dx, dy in pending:
                nx, ny = cx + dx, cy + dy
                if 0 < nx < self.size - 1 and 0 < ny < self.size - 1 and self.maze[nx, ny] == 1:
                    self.maze[cx + dx // 2, cy + dy // 2] = 0  # knock down the wall between
                    self.maze[nx, ny] = 0
                    stack.append((nx, ny, shuffled_directions()))
                    break
            else:
                stack.pop()  # every direction tried: backtrack

    def _add_multiple_paths(self, num_walls_to_remove):
        """Open up to ``num_walls_to_remove`` random interior walls.

        The request is capped at the number of interior walls that actually
        exist, so the sampling loop always terminates.
        """
        available = int(np.count_nonzero(self.maze[1:-1, 1:-1] == 1))
        target = min(num_walls_to_remove, available)

        count = 0
        while count < target:
            r = random.randint(1, self.size - 2)
            c = random.randint(1, self.size - 2)
            if self.maze[r, c] == 1:
                self.maze[r, c] = 0
                count += 1

    def sample(self):
        """Return a uniformly random action index."""
        return random.randint(0, self.n_actions - 1)

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.agent_pos = [1, 1]
        self.visited_checkpoints = [False] * len(self.checkpoints)
        self.steps = 0
        self.max_steps = 1000
        return self._get_obs()

    def _get_obs(self):
        """Return ``(row, col, checkpoint_mask)`` for the current state."""
        cp_id = sum(1 << i for i, visited in enumerate(self.visited_checkpoints) if visited)
        return (self.agent_pos[0], self.agent_pos[1], cp_id)

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done)``.

        ``action`` indexes up, down, left, right. Moving into a wall leaves
        the agent in place. The episode ends when the goal is reached with
        every checkpoint visited, or after ``max_steps`` steps.
        """
        dr, dc = self._MOVES[action]
        new_pos = [self.agent_pos[0] + dr, self.agent_pos[1] + dc]

        reward = -0.01
        done = False

        if self.maze[new_pos[0], new_pos[1]] == 0:
            self.agent_pos = new_pos
            for i, cp in enumerate(self.checkpoints):
                if self.agent_pos == cp and not self.visited_checkpoints[i]:
                    self.visited_checkpoints[i] = True
                    reward = 5.0  # bonus for a first-time checkpoint visit
        else:
            reward = -0.05  # bumped into a wall

        # The goal only counts once every checkpoint has been collected.
        if self.agent_pos == self.goal_pos and all(self.visited_checkpoints):
            reward = 50.0  # final reward
            done = True

        self.steps += 1
        if self.steps >= self.max_steps:
            done = True

        return self._get_obs(), reward, done

    def render(self):
        """Draw the maze, checkpoints, goal and agent in a pygame window."""
        if self.screen is None:
            pygame.init()
            # Size the window from this instance's maze, not the module default.
            side = self.size * CELL_SIZE
            self.screen = pygame.display.set_mode((side, side))
            pygame.display.set_caption("Bush Maze RL - All Checkpoints")
            self.clock = pygame.time.Clock()

        pygame.event.pump()  # keep the window responsive
        self.screen.fill(COLOR_PATH)

        for r in range(self.size):
            for c in range(self.size):
                if self.maze[r, c] == 1:
                    rect = pygame.Rect(c * CELL_SIZE, r * CELL_SIZE, CELL_SIZE, CELL_SIZE)
                    pygame.draw.rect(self.screen, COLOR_WALL, rect)

        for i, cp in enumerate(self.checkpoints):
            color = COLOR_CHECKED if self.visited_checkpoints[i] else COLOR_CHECKPOINT
            cp_rect = pygame.Rect(cp[1] * CELL_SIZE + 4, cp[0] * CELL_SIZE + 4, CELL_SIZE - 8, CELL_SIZE - 8)
            pygame.draw.rect(self.screen, color, cp_rect, border_radius=3)

        goal_rect = pygame.Rect(self.goal_pos[1] * CELL_SIZE, self.goal_pos[0] * CELL_SIZE, CELL_SIZE, CELL_SIZE)
        current_goal_color = COLOR_GOAL if all(self.visited_checkpoints) else COLOR_GOAL_LOCKED
        pygame.draw.ellipse(self.screen, current_goal_color, goal_rect)

        agent_rect = pygame.Rect(self.agent_pos[1] * CELL_SIZE, self.agent_pos[0] * CELL_SIZE, CELL_SIZE, CELL_SIZE)
        pygame.draw.rect(self.screen, COLOR_AGENT, agent_rect, border_radius=5)

        pygame.display.flip()
        self.clock.tick(FPS)

    def close(self):
        """Shut pygame down if a window was opened."""
        if self.screen is not None:
            pygame.quit()
            self.screen = None
            self.clock = None



In [3]:
# Seed the RNG before the environment is built so that both the generated
# maze and the epsilon-greedy exploration are reproducible across
# Restart & Run All. Only the stdlib `random` module is used for sampling.
SEED = 42
random.seed(SEED)

env = HedgeMazeEnv()
state_space = env.n_states
action_space = env.n_actions

# Change 5: more episodes and a slower decay because the problem is much harder.
numeroepisodios = 5000  # number of training episodes
learning_rate = 0.7     # Q-learning step size
max_steps = 1000        # per-episode step cap for this loop
gamma = 0.95            # discount factor
max_epsilon = 1.0       # exploration rate at episode 0
min_epsilon = 0.05      # exploration floor
decay_rate = 0.001      # decays slowly so the agent explores for longer

# One Q-value per (row, col, checkpoint_mask, action).
Qtable = np.zeros((*state_space, action_space))

# try/finally guarantees the pygame window is closed even if training is
# interrupted or an exception is raised part-way through.
try:
    for episodio in range(numeroepisodios):
        # Exponentially decaying exploration rate.
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episodio)
        state = env.reset()
        done = False

        for step in range(max_steps):
            if random.uniform(0, 1) > epsilon:
                # Greedy action, breaking ties at random so that the all-zero
                # initial table does not always favour action 0.
                q_values = Qtable[state]
                max_q = np.max(q_values)
                best_actions = [a for a in range(action_space) if q_values[a] == max_q]
                action = random.choice(best_actions)
            else:
                action = env.sample()

            new_state, reward, done = env.step(action)

            # Standard Q-learning update.
            sa = state + (action,)
            Qtable[sa] = Qtable[sa] + learning_rate * (
                reward + gamma * np.max(Qtable[new_state]) - Qtable[sa]
            )

            # Only the last five episodes are rendered.
            if episodio >= numeroepisodios - 5:
                env.render()

            if done:
                break

            state = new_state

        if (episodio + 1) % 500 == 0:
            print(f"Episodio {episodio + 1}/{numeroepisodios} completado.")
finally:
    env.close()

print("Entrenamiento finalizado.")

Episodio 500/5000 completado.
Episodio 1000/5000 completado.
Episodio 1500/5000 completado.
Episodio 2000/5000 completado.
Episodio 2500/5000 completado.
Episodio 3000/5000 completado.
Episodio 3500/5000 completado.
Episodio 4000/5000 completado.
Episodio 4500/5000 completado.
Episodio 5000/5000 completado.
Entrenamiento finalizado.
