In [1]:
# !pip install "stable-baselines3[extra]" gymnasium minigrid torch torchvision imageio[ffmpeg]
# !pip install "gymnasium[other]"

import os, shutil, glob, subprocess, random, re
import gymnasium as gym
import numpy as np
import minigrid  # noqa
from minigrid.minigrid_env import MiniGridEnv
from minigrid.core.grid import Grid
from minigrid.core.world_object import Wall, Goal
from minigrid.wrappers import RGBImgObsWrapper, ImgObsWrapper, FullyObsWrapper
from gymnasium.spaces import Text
from gymnasium import ActionWrapper, ObservationWrapper, RewardWrapper, spaces
from gymnasium.wrappers import RecordVideo
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from minigrid.core.actions import Actions
from minigrid.core.world_object import Goal
from collections import deque
import torch 

ENV_ID    = "MiniGrid-FourRooms-v0"   # 19x19 기본
VIDEO_DIR = "videos"
OUT_FILE  = "learning_x2.mp4"

print('=== GPU 사용 가능 여부 확인 ==='); 
print(f'CUDA 사용 가능: {torch.cuda.is_available()}'); 
print(f'CUDA 버전: {torch.version.cuda if torch.cuda.is_available() else "N/A"}')
print(f'사용 가능한 GPU 개수: {torch.cuda.device_count()}')




2025-09-23 14:11:02.220652: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-23 14:11:02.440150: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-23 14:11:03.119069: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


=== GPU 사용 가능 여부 확인 ===
CUDA 사용 가능: True
CUDA 버전: 12.8
사용 가능한 GPU 개수: 1


In [2]:
class FourRoomsSmall(MiniGridEnv):
    """
    11x11 기본. 내부 십자벽 + 네 곳 통로. 시작/목표 고정 가능.
    """
    def __init__(self, size=11, agent_start_pos=(1,1), goal_pos=(9,9),
                 start_dir=0, max_steps=200, **kwargs):
        assert size % 2 == 1 and size >= 9, "홀수, 최소 9 권장"
        self.size = size
        self._agent_start_pos = agent_start_pos
        self._goal_pos = goal_pos
        self._start_dir = start_dir
        mission_space = Text(max_length=50)
        super().__init__(mission_space=mission_space,
            grid_size=size, max_steps=max_steps,
            see_through_walls=False, **kwargs)

    def _gen_grid(self, width, height):
        # 빈 그리드 생성 + 외벽
        self.grid = Grid(width, height)
        self.grid.wall_rect(0, 0, width, height)

        # 십자 내부벽
        midx, midy = width // 2, height // 2
        for y in range(1, height-1):
            if y != midy:
                self.grid.set(midx, y, Wall())
        for x in range(1, width-1):
            if x != midx:
                self.grid.set(x, midy, Wall())

        # 네 통로(문) 위치: 필요시 수정
        openings = [
            (midx, 2),                  # 위쪽 통로
            (midx, height - 3),         # 아래쪽 통로
            (2, midy),                  # 왼쪽 통로
            (width - 3, midy),          # 오른쪽 통로
        ]
        for x, y in openings:
            self.grid.set(x, y, None)

        # 목표 배치
        if self._goal_pos is not None:
            gx, gy = self._goal_pos
            self.put_obj(Goal(), gx, gy)
        else:
            self.place_obj(Goal(), top=(midx+1, 1), size=(width-midx-2, height-2))

        # 에이전트 시작 배치
        if self._agent_start_pos is not None:
            sx, sy = self._agent_start_pos
            self.agent_pos = np.array([sx, sy], dtype=np.int64)
            self.agent_dir = self._start_dir
        else:
            self.place_agent(top=(1, 1), size=(midx-2, height-2))

        self.mission = "reach the goal"

# ---------- 0) 고정 시작/목표 + 고정 레이아웃 ----------
class FixedStartGoal(ObservationWrapper):
    """항상 같은 레이아웃/시작/목표로 초기화."""
    def __init__(self, env, start=(1,1), goal=(17,17), layout_seed=123, start_dir=0):
        super().__init__(env)
        self.start = tuple(start)
        self.goal = tuple(goal)
        self.layout_seed = int(layout_seed)
        self.start_dir = int(start_dir)

    # 필수: 관측을 그대로 통과
    def observation(self, obs):
        return obs

    def _is_free(self, pos):
        x, y = pos
        g = self.unwrapped.grid.get(x, y)
        return (g is None) or (getattr(g, "type", None) == "goal")

    def _nearest_free(self, target):
        W, H = self.unwrapped.grid.width, self.unwrapped.grid.height
        q = deque([target]); seen = {target}
        while q:
            x, y = q.popleft()
            if 0 <= x < W and 0 <= y < H and self._is_free((x,y)):
                return (x, y)
            for dx,dy in [(1,0),(-1,0),(0,1),(0,-1)]:
                nx, ny = x+dx, y+dy
                if 0 <= nx < W and 0 <= ny < H and (nx,ny) not in seen:
                    seen.add((nx,ny)); q.append((nx,ny))
        return target

    def reset(self, **kwargs):
        # 레이아웃 고정 seed로 초기화
        obs, info = self.env.reset(seed=self.layout_seed)

        # 목표 강제 배치
        gx, gy = self._nearest_free(self.goal)
        grid = self.unwrapped.grid
        W, H = grid.width, grid.height
        for x in range(W):
            for y in range(H):
                obj = grid.get(x,y)
                if obj is not None and getattr(obj, "type", None) == "goal":
                    grid.set(x,y,None)
        grid.set(gx, gy, Goal())

        # 시작 위치/방향 고정
        sx, sy = self._nearest_free(self.start)
        self.unwrapped.agent_pos = np.array([sx, sy], dtype=np.int64)
        self.unwrapped.agent_dir = self.start_dir

        # 관측 재계산
        obs = self.unwrapped.gen_obs()
        return obs, info

class StepPenalty(RewardWrapper):
    """스텝당 -p, 목표 도달 시 +1.0. 기본값 p=0.01."""
    def __init__(self, env, step_penalty=0.01):
        super().__init__(env)
        self.step_penalty = float(step_penalty)
    def reward(self, reward):
        # MiniGrid의 성공 보상(1.0)은 유지, 그 외엔 -step_penalty
        return reward if reward > 0 else -self.step_penalty

# ---------- 1) 내비게이션 전용 액션 ----------
class OnlyNavActions(ActionWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.action_space = spaces.Discrete(3)  # 0:left, 1:right, 2:forward
    def action(self, act):
        i = int(act) % 3
        return {0: Actions.left, 1: Actions.right, 2: Actions.forward}[i]

# ---------- 2) 환경 생성 ----------
def make_env(render_mode=None, fully_obs=False, start=(1,1), goal=(17,17)):
    # env = gym.make(ENV_ID, render_mode=render_mode, max_steps=100)
    env = FourRoomsSmall(
        size=9,                  # 9/11/13 등으로 변경 가능
        agent_start_pos=(1,1),    # 시작 고정
        goal_pos=(7,7),           # 목표 고정
        start_dir=0,
        max_steps=200,
        render_mode=render_mode
    )
    env = FixedStartGoal(env, start=start, goal=goal, layout_seed=123, start_dir=0)
    if fully_obs:
        env = FullyObsWrapper(env)
    env = OnlyNavActions(env)
    env = RGBImgObsWrapper(env)
    env = ImgObsWrapper(env)
    env = StepPenalty(env, step_penalty=0.01)   # 최단 스텝 유도
    return env

# ---------- 3) 주기적 녹화 콜백(자막 1회만) ----------
class RecordEveryNEpisodes(BaseCallback):
    def __init__(self, every_episodes=50, video_root="videos", start=(1,1), goal=(17,17)):
        super().__init__(); self.every=every_episodes; self.ep=0; self.last=0
        self.video_root = video_root; os.makedirs(video_root, exist_ok=True)
        self.start, self.goal = start, goal

    def _burn_text(self, in_mp4, text, out_mp4):
        draw = ("drawtext=text='" + text.replace(":", "\\:") +
                "':x=20:y=h-50:fontsize=10:fontcolor=white:box=1:boxcolor=black@0.55:boxborderw=10")
        subprocess.run(["ffmpeg","-y","-i",in_mp4,"-vf",draw,"-an",out_mp4], check=True)
        
    def _on_step(self):
        dones = self.locals.get("dones")
        if dones is None: return True
        n_done = int(dones.sum()) if hasattr(dones, "sum") else int(bool(dones))
        if n_done == 0: return True

        self.ep += n_done
        if self.ep - self.last < self.every: return True

        tag = f"ep_{self.ep}_ts_{self.num_timesteps}"
        out_dir = os.path.join(self.video_root, tag)
        if os.path.exists(out_dir): shutil.rmtree(out_dir)

        eval_env = RecordVideo(
            make_env(render_mode="rgb_array", start=self.start, goal=self.goal),
            video_folder=out_dir,
            episode_trigger=lambda e: e == 0,
            name_prefix=f"eval_{tag}"
        )
        obs, _ = eval_env.reset()  # 레이아웃/시작/목표는 FixedStartGoal이 고정

        # 평가(확률 정책 + 정지 방지)
        ep_reward, done, ep_steps = 0.0, False, 0
        last_pos = getattr(eval_env.unwrapped, "agent_pos", None)
        still, MAX_STILL = 0, 4
        while not done:
            action, _ = self.model.predict(obs, deterministic=False)
            obs, r, t, tr, _ = eval_env.step(action)
            ep_reward += r; ep_steps += 1
            cur_pos = getattr(eval_env.unwrapped, "agent_pos", None)
            if np.array_equal(cur_pos, last_pos):
                still += 1
                if still >= MAX_STILL:
                    obs, _, t2, tr2, _ = eval_env.step(Actions.right)
                    ep_steps += 1
                    t |= t2; tr |= tr2; still = 0
            else:
                still = 0
            last_pos = cur_pos
            done = t or tr
        eval_env.close()

        mp4s = sorted([p for p in glob.glob(os.path.join(out_dir,"*.mp4"))
                       if not p.endswith("_labeled.mp4")])
        if mp4s:
            status = "SUCCESS" if ep_reward > 0 else "FAIL"
            text = f"EP={self.ep}  STEPS={ep_steps} TS={self.num_timesteps} R={ep_reward:.2f}, {status}"
            labeled = mp4s[0].replace(".mp4","_labeled.mp4")
            self._burn_text(mp4s[0], text, labeled)
            os.remove(mp4s[0])

        print(f"[RECORDED] {tag} R={ep_reward:.2f} steps={ep_steps} → {out_dir}")
        self.last = self.ep
        return True

# ---------- 4) 최단 경로(BFS) 계산해 검증 ----------
def bfs_shortest_unwrapped(env_unwrapped):
    grid = env_unwrapped.grid
    W, H = grid.width, grid.height
    sx, sy = tuple(env_unwrapped.agent_pos)
    # goal 찾기
    gx = gy = None
    for x in range(W):
        for y in range(H):
            obj = grid.get(x, y)
            if obj is not None and getattr(obj, "type", None) == "goal":
                gx, gy = x, y
    assert gx is not None
    from collections import deque
    q = deque([(sx, sy)]); dist = {(sx,sy):0}
    def passable(x,y):
        o = grid.get(x,y)
        return (o is None) or (getattr(o,"type",None)=="goal")
    while q:
        x,y = q.popleft()
        if (x,y)==(gx,gy): return dist[(x,y)]
        for dx,dy in [(1,0),(-1,0),(0,1),(0,-1)]:
            nx,ny = x+dx, y+dy
            if 0<=nx<W and 0<=ny<H and passable(nx,ny) and (nx,ny) not in dist:
                dist[(nx,ny)] = dist[(x,y)]+1; q.append((nx,ny))
    return None

# ================== 실행 ==================
if __name__ == "__main__":
    os.makedirs(VIDEO_DIR, exist_ok=True)

    # 고정 좌표 지정(벽이면 래퍼가 인접 빈칸으로 자동 보정)
    START = (1,1)
    GOAL  = (7,7)

    # 훈련
    train_env = make_env(start=START, goal=GOAL)
    model = PPO("CnnPolicy", train_env, verbose=1)
    cb = RecordEveryNEpisodes(every_episodes=20, video_root=VIDEO_DIR, start=START, goal=GOAL)
    model.learn(total_timesteps=30000, callback=cb)

    # 라벨본만 모아 타임라인 합치기
    labeled = []
    for root, _, fs in os.walk(VIDEO_DIR):
        for f in fs:
            if f.startswith("eval_") and f.endswith("_labeled.mp4"):
                m = re.search(r"ep_(\d+)", f); ep = int(m.group(1)) if m else 0
                labeled.append((ep, os.path.join(root, f)))
    labeled.sort(key=lambda x: x[0])
    if labeled:
        list_path = "list.txt"
        with open(list_path,"w") as f:
            for _,p in labeled: f.write(f"file '{os.path.abspath(p)}'\n")
        subprocess.run([
        "ffmpeg","-y",
        "-f","concat","-safe","0",
        "-i",list_path,
        "-filter:v","setpts=0.5*PTS",
        "-an",
        OUT_FILE], check=True)
        print(f"학습 타임라인: {OUT_FILE}")
    else:
        print("라벨본 없음")

    # 최단 경로 길이와 정책 성능 비교
    check_env = make_env(render_mode=None, start=START, goal=GOAL)
    check_env.reset()
    shortest = bfs_shortest_unwrapped(check_env.unwrapped)
    print(f"BFS shortest steps = {shortest}")

    # 학습 정책으로 실행해 실제 스텝 측정
    eval_env = make_env(render_mode="rgb_array", start=START, goal=GOAL)
    obs, _ = eval_env.reset()
    steps, done = 0, False
    while not done and steps < 500:
        act, _ = model.predict(obs, deterministic=True)
        obs, r, t, tr, _ = eval_env.step(act)
        steps += 1
        done = t or tr
    eval_env.close()
    print(f"Policy steps = {steps}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -2       |
| time/              |          |
|    fps             | 282      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 2048     |
---------------------------------


ffmpeg version 5.0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 10.3.0 (conda-forge gcc 10.3.0-16)
  configuration: --prefix=/home/isjo/miniconda3/envs/test_ship --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1653042464189/_build_env/bin/x86_64-conda-linux-gnu-cc --disable-doc --disable-openssl --enable-demuxer=dash --enable-gnutls --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-vaapi --enable-libx264 --enable-libx265 --enable-libaom --enable-libsvtav1 --enable-libxml2 --enable-libvpx --enable-pic --enable-pthreads --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/home/conda/feedstock_root/build_artifacts/ffmpeg_1653042464189/_build_env/bin/pkg-config
  libavutil      57. 17.100 / 57. 17.100
  libavcodec     59. 18.100 / 59. 18.100
  libavformat    59. 16.100 / 59. 16.100
  libavdevice    59.  4.100 / 59.  4.100
  libavfilter     8. 24.100 /  8. 24.100
  libswscale    

CalledProcessError: Command '['ffmpeg', '-y', '-i', 'videos/ep_20_ts_4000/eval_ep_20_ts_4000-episode-0.mp4', '-vf', "drawtext=text='EP=20  STEPS=200 TS=4000 R=-1.72, FAIL':x=20:y=h-50:fontsize=10:fontcolor=white:box=1:boxcolor=black@0.55:boxborderw=10", '-an', 'videos/ep_20_ts_4000/eval_ep_20_ts_4000-episode-0_labeled.mp4']' returned non-zero exit status 1.