In [94]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import ppo

In [230]:
class Maze(gym.Env):
    """Random square grid maze exposed as a Gymnasium environment.

    The grid is ``size x size``; a cell equal to 1 is a wall and 0 is free.
    Each cell is a wall when a uniform draw exceeds 0.8. The agent starts at
    ``(0, 0)`` and must reach ``(size - 1, size - 1)``; both cells are forced
    free. Nothing guarantees that a path between them exists.

    Actions (``Discrete(4)``) map to index offsets through
    ``action_convertissor``: 0 -> (0, +1), 1 -> (0, -1), 2 -> (+1, 0),
    3 -> (-1, 0). Every step yields reward -1 except the step that reaches
    the goal, which yields 0 and terminates the episode.
    """

    def __init__(self, size: int) -> None:
        """Build a new random maze with ``size`` cells per side."""
        super().__init__()
        self.action_space = gym.spaces.Discrete(4)
        # Coordinates are grid indices, so the largest legal value is size - 1.
        self.observation_space = gym.spaces.Box(low=0, high=size - 1, shape=(2,), dtype=int)
        self.size = size
        self.maze = (np.random.uniform(0, 1, (size, size)) > 0.8).astype(int)
        self.start_point = (0, 0)
        self.end_point = (size - 1, size - 1)
        # Start and goal must never be walls.
        self.maze[self.start_point] = 0
        self.maze[self.end_point] = 0
        # Kept as a tuple so it can index ``self.maze`` directly.
        self.current_point = self.start_point
        self.action_convertissor = {0: (0, 1), 1: (0, -1), 2: (1, 0), 3: (-1, 0)}

    def _observation(self):
        """Return the agent position as an array matching ``observation_space``."""
        return np.array(self.current_point, dtype=self.observation_space.dtype)

    def reset(self, seed=None, config=None, options=None):
        """Move the agent back to the start cell.

        Args:
            seed: forwarded to ``gym.Env.reset``.
            config: accepted for backward compatibility; ignored.
            options: standard Gymnasium reset options; ignored.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.current_point = self.start_point
        return self._observation(), {}

    def step(self, action):
        """Apply one move and return ``(obs, reward, terminated, truncated, info)``.

        A move that would leave the grid or enter a wall keeps the agent in
        place and still costs -1. ``truncated`` is always False here: time
        limits are expected to be imposed by a wrapper, not by the maze.
        """
        d_row, d_col = self.action_convertissor[int(action)]
        next_point = (self.current_point[0] + d_row, self.current_point[1] + d_col)
        in_bounds = 0 <= next_point[0] < self.size and 0 <= next_point[1] < self.size
        # Bounds are checked first so the maze is never indexed out of range
        # (negative indices would otherwise silently wrap around).
        if not in_bounds or self.maze[next_point]:
            return self._observation(), -1, False, False, {}
        self.current_point = next_point
        if next_point == self.end_point:
            # Reaching the goal is a natural end of episode, not a truncation.
            return self._observation(), 0, True, False, {}
        return self._observation(), -1, False, False, {}


In [244]:
from pygame import ver
from tqdm import tqdm
from gymnasium.wrappers import TimeLimit

def run_maze(env, learn_timesteps=10000, max_episode_steps=100, n_trials=10):
    """Train fresh PPO agents on ``env`` and count how many reach the goal.

    For each trial a new PPO model is trained for ``learn_timesteps`` steps on
    the time-limited environment, then a single evaluation episode is played.
    A trial counts as solved only if that episode ends with ``terminated``.

    Args:
        env: environment to wrap in ``TimeLimit``.
        learn_timesteps: training budget passed to ``model.learn``.
        max_episode_steps: step cap for both the ``TimeLimit`` wrapper and
            the evaluation loop.
        n_trials: number of independent train-and-evaluate rounds.

    Returns:
        ``(amount_done, steps)``: the number of solved trials and the total
        number of environment steps taken across the solved trials.
    """
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    amount_done = 0
    steps = 0
    for _ in tqdm(range(n_trials)):
        env.reset()
        model = ppo.PPO("MlpPolicy", env)
        model.learn(total_timesteps=learn_timesteps)
        obs, _ = env.reset()
        # Pre-bind so the check below is safe even if the loop never runs.
        terminated = False
        steps_taken = 0
        # Count from 1 so ``steps_taken`` is the number of env.step calls made.
        for steps_taken in range(1, max_episode_steps + 1):
            action, _states = model.predict(obs)
            obs, reward, terminated, truncated, info = env.step(action)
            if terminated or truncated:
                break
        if terminated:
            amount_done += 1
            steps += steps_taken
            print("done in", steps_taken, "steps")
    success_rate = amount_done / n_trials if n_trials else 0
    print(success_rate, steps / amount_done if amount_done else 0)
    return amount_done, steps

In [246]:
# Build one random maze and print its grid (1 = wall, 0 = free cell).
maze_size = 8
env = Maze(maze_size)
env.reset()
print(env.maze)

# Sweep the per-episode step limit over 50, 60, ..., 100 with a fixed
# training budget of 2 * maze_size**4 timesteps; keep each run_maze result.
amount_done_steps = [
    run_maze(env, 2 * maze_size ** 4, step_limit)
    for step_limit in range(50, 110, 10)
]


[[0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0]
 [1 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 1 1]
 [0 1 0 0 1 1 0 0]
 [0 0 0 0 0 0 0 0]]


100%|██████████| 10/10 [01:07<00:00,  6.80s/it]


0.0 0


100%|██████████| 10/10 [01:06<00:00,  6.68s/it]


0.0 0


100%|██████████| 10/10 [01:06<00:00,  6.68s/it]


0.0 0


100%|██████████| 10/10 [01:06<00:00,  6.67s/it]


0.0 0


 70%|███████   | 7/10 [00:45<00:19,  6.53s/it]

done in 74 steps


100%|██████████| 10/10 [01:05<00:00,  6.56s/it]


0.1 74.0


 50%|█████     | 5/10 [00:32<00:32,  6.55s/it]

done in 79 steps


 90%|█████████ | 9/10 [00:59<00:06,  6.57s/it]

done in 54 steps


100%|██████████| 10/10 [01:05<00:00,  6.57s/it]

0.2 66.5





In [241]:
# Display the run_maze results collected by the step-limit sweep.
# NOTE(review): the output saved under this cell appears to be stale. It has
# execution count 241 (before the sweep cell's 246) and shows ten pairs of
# fractional values, whereas the sweep in this notebook yields six pairs.
# Re-run with Restart Kernel -> Run All to confirm.
amount_done_steps

[(0.07, 2.74),
 (0.060000000000000005, 2.5500000000000003),
 (0.060000000000000005, 2.15),
 (0.07, 2.32),
 (0.07, 2.6900000000000004),
 (0.08, 2.5700000000000003),
 (0.03, 0.9199999999999999),
 (0.03, 1.7400000000000002),
 (0.07, 2.0700000000000003),
 (0.09999999999999999, 2.98)]