In [1]:
import numpy as np

# Minimal stand-in for a DiscreteEnv class, used as the base class of CliffWalkingEnv
class DiscreteEnv:
    """Tabular environment driven by an explicit transition table.

    The caller supplies the table ``P`` and the initial state distribution
    ``isd``; this class only tracks the current state and looks transitions up.
    """

    def __init__(self, nS, nA, P, isd):
        # Transition table: P[state][action] = [(prob, next_state, reward, done), ...]
        self.P = P
        self.nS, self.nA = nS, nA  # sizes of the state space and the action space
        self.isd = isd             # initial state distribution
        self.s = None              # current state; stays None until reset() runs

    def reset(self):
        """Jump to the most probable initial state and return it."""
        start = np.argmax(self.isd)
        self.s = start
        return start

    def step(self, action):
        """Apply ``action`` in the current state.

        Only the first entry of ``P[state][action]`` is used, i.e. the
        transition is treated as deterministic and its probability is ignored.

        Returns:
            A ``(next_state, reward, done, info)`` tuple where ``info`` is an
            empty dict.
        """
        outcome = self.P[self.s][action][0]
        _prob, following, reward, done = outcome
        self.s = following  # advance the environment
        return (following, reward, done, {})

# CliffWalkingEnv extends DiscreteEnv to define the cliff-walking grid world
class CliffWalkingEnv(DiscreteEnv):
    """Deterministic cliff-walking grid world.

    The agent starts in the bottom-left cell and must reach the bottom-right
    cell. Every bottom-row cell between those two is a cliff. Each move costs
    -1; moving onto a cliff cell costs -100 and ends the episode, as does
    reaching the goal. Moves that would leave the grid are clamped to the edge.
    """

    # Action encoding
    UP, RIGHT, DOWN, LEFT = 0, 1, 2, 3

    def _limit_coordinates(self, coord):
        """Clamp ``coord`` (row, col) into the grid, in place, and return it."""
        coord[0] = min(max(coord[0], 0), self.shape[0] - 1)
        coord[1] = min(max(coord[1], 0), self.shape[1] - 1)
        return coord

    def _calculate_transition_prob(self, current, delta):
        """Return the single transition for moving from ``current`` by ``delta``.

        Args:
            current: (row, col) of the cell the agent is in.
            delta: (d_row, d_col) offset of the action.

        Returns:
            A one-element list ``[(1.0, next_state, reward, done)]`` using
            builtin ``int``/``float``/``bool`` values.
        """
        moved = self._limit_coordinates(np.array(current) + np.array(delta))
        new_position = tuple(int(c) for c in moved)
        # Flatten (row, col) into the state id used as a key of P
        new_state = int(np.ravel_multi_index(new_position, self.shape))
        fell_off = bool(self._cliff[new_position])
        reward = -100.0 if fell_off else -1.0
        # The episode ends on a cliff cell or on the goal cell
        is_done = fell_off or new_position == self._goal
        return [(1.0, new_state, reward, is_done)]

    def __init__(self, shape=(4, 12)):
        """Build the transition table for a grid of ``shape`` (rows, columns).

        Args:
            shape: Grid size; defaults to the classic 4 x 12 layout.

        Raises:
            ValueError: If ``shape`` is not two positive sizes with at least
                two columns (start and goal must be distinct cells).
        """
        shape = tuple(int(d) for d in shape)
        if len(shape) != 2 or shape[0] < 1 or shape[1] < 2:
            raise ValueError(
                f"shape must be (rows >= 1, columns >= 2), got {shape!r}"
            )
        self.shape = shape
        nS = int(np.prod(self.shape))  # total number of grid cells
        nA = 4                         # up, right, down, left

        # Start and goal sit at the two ends of the bottom row; both must be
        # set before the transition table is built because
        # _calculate_transition_prob reads self._goal and self._cliff.
        self._start = (self.shape[0] - 1, 0)
        self._goal = (self.shape[0] - 1, self.shape[1] - 1)

        # Boolean mask of cliff cells: the bottom row without its two ends
        self._cliff = np.zeros(self.shape, dtype=bool)
        self._cliff[-1, 1:-1] = True

        moves = {
            self.UP: (-1, 0),
            self.RIGHT: (0, 1),
            self.DOWN: (1, 0),
            self.LEFT: (0, -1),
        }
        # Transition table P[state][action] = [(prob, next_state, reward, done)]
        P = {}
        for s in range(nS):
            position = np.unravel_index(s, self.shape)  # flat id -> (row, col)
            P[s] = {
                action: self._calculate_transition_prob(position, delta)
                for action, delta in moves.items()
            }

        isd = np.zeros(nS)  # initial state distribution
        isd[np.ravel_multi_index(self._start, self.shape)] = 1.0  # always start at _start
        super().__init__(nS, nA, P, isd)
        self.reset()  # place the agent on the start cell

    def render(self):
        """Print the grid: ``x`` agent, ``T`` goal, ``C`` cliff, ``o`` free cell.

        Cells in a row are separated by two spaces, each row ends with a
        newline, and one blank line follows the grid.
        """
        n_rows, n_cols = self.shape
        for row in range(n_rows):
            symbols = []
            for col in range(n_cols):
                # row * n_cols + col is the row-major flat state id
                if self.s == row * n_cols + col:
                    symbols.append("x")
                elif (row, col) == self._goal:
                    symbols.append("T")
                elif self._cliff[row, col]:
                    symbols.append("C")
                else:
                    symbols.append("o")
            print("  ".join(symbols))
        print()  # trailing blank line

# Smoke test: build the environment and render its initial state
env = CliffWalkingEnv()
env.render()

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

