In [1]:
import numpy as np

# 定義簡單的格子世界環境
# S：起始點
# F：一般方塊
# G：目標方塊
# H：陷阱方塊
# 可行動作：上（0）、右（1）、下（2）、左（3）
grid_world = [
    ['S', 'F', 'H'],
    ['F', 'H', 'F'],
    ['F', 'F', 'G']
]

# 設置環境參數
num_episodes = 1000
max_steps_per_episode = 100
learning_rate = 0.1
discount_rate = 0.99
exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.01

# 取得環境尺寸
num_rows = len(grid_world)
num_cols = len(grid_world[0])
num_actions = 4  # 上、右、下、左

# 建立 Q-table（狀態-動作價值表）
q_table = np.zeros((num_rows, num_cols, num_actions))

# Q-learning 算法
for episode in range(num_episodes):
    state = (0, 0)  # 起始狀態

    for step in range(max_steps_per_episode):
        # 使用 ε-貪心策略選擇行動
        exploration_threshold = np.random.uniform(0, 1)
        if exploration_threshold > exploration_rate:
            action = np.argmax(q_table[state])
        else:
            action = np.random.randint(0, num_actions)

        # 執行選擇的行動並觀察新狀態和獎勵
        new_state = state
        if action == 0 and state[0] > 0:  # 上
            new_state = (state[0] - 1, state[1])
        elif action == 1 and state[1] < num_cols - 1:  # 右
            new_state = (state[0], state[1] + 1)
        elif action == 2 and state[0] < num_rows - 1:  # 下
            new_state = (state[0] + 1, state[1])
        elif action == 3 and state[1] > 0:  # 左
            new_state = (state[0], state[1] - 1)

        # 更新 Q-table
        if grid_world[new_state[0]][new_state[1]] == 'G':  # 到達目標
            q_table[state][action] = 100
            break
        elif grid_world[new_state[0]][new_state[1]] == 'H':  # 陷阱
            q_table[state][action] = -100
            break
        else:
            # Q-learning 更新公式
            q_table[state][action] = q_table[state][action] + learning_rate * (
                0 + discount_rate * np.max(q_table[new_state]) - q_table[state][action]
            )

        state = new_state

    # 降低探索率
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

# 印出最終的 Q-table
print("Final Q-table:")
print(q_table)


Final Q-table:
[[[  85.46285278   32.30944661   97.0299       84.07688756]
  [   0.79857279 -100.         -100.           64.57577892]
  [   0.            0.            0.            0.        ]]

 [[  79.16595172 -100.           98.01         69.53096376]
  [   0.            0.            0.            0.        ]
  [   0.            0.            0.            0.        ]]

 [[  87.0454805    99.           94.50159517   92.35883193]
  [-100.          100.           86.9639112    87.1404317 ]
  [   0.            0.            0.            0.        ]]]


In [1]:
import random
import gym
from time import sleep
 
env = gym.make('Taxi-v3') #创建出租车游戏环境
env.render() # 用于渲染出当前的智能体以及环境的状态
 
# 将Q表初始化为一个字典，它存储指定在状态s中执行动作a的值的状态-动作对。
q = {}
for s in range(env.observation_space.n):
    for a in range(env.action_space.n):
        q[(s,a)] = 0.0
 
# 定义一个名为update_q_table的函数，根据q学习更新规则更新q值
def update_q_table(prev_state, action, reward, nextstate, alpha, gamma):
    qa = max([q[(nextstate, a)] for a in range(env.action_space.n)]) # 取一个状态-动作对的最大值，并将其存储在一个名为qa的变量中
    # max Q(s', a')
    q[(prev_state, action)] += alpha * (reward + gamma * qa - q[(prev_state, action)]) # 用更新规则更新前一个状态的Q值
    # Q(s, a) <- Q(s, a) + alpha (r + gamma max Q(s', a') - Q(s, a))
 
# epsilon贪心策略函数
def epsilon_greedy_policy(state, epsilon):
    if random.uniform(0,1) < epsilon:
        return env.action_space.sample() # 随机，用epsilon概率探索新动作
    else:
        return max(list(range(env.action_space.n)), key = lambda x: q[(state,x)]) # 用1-epsilon的概率选择Q表最佳动作
 
# 初始化变量
alpha = 0.4  # TD学习率
gamma = 0.999 # 折扣率
epsilon = 0.017 # 贪心策略中epsilon的值
num_episodes = 1000 # 玩几局游戏
 
# 执行Q-Learning
for episode in range(num_episodes): # 玩几局游戏
    steps, r = 0, 0 # 每局走多少步，总体奖励
    prev_state = env.reset() # 用于重置环境
    while True:
        steps += 1 # 每局走多少步
        env.render() # 用于渲染出当前的智能体以及环境的状态
        # In each state, we select the action by epsilon-greedy policy
        action = epsilon_greedy_policy(prev_state, epsilon)
        # then we perform the action and move to the next state, and receive the reward
        nextstate, reward, done, _ = env.step(action)
        # Next we update the Q value using our update_q_table function
        # which updates the Q value by Q learning update rule
        update_q_table(prev_state, action, reward, nextstate, alpha, gamma)
        # Finally we update the previous state as next state
        prev_state = nextstate # s <- s'
        # Store all the rewards obtained
        r += reward # reward: 即时奖励, r: total reward
        # we will break the loop, if we are at the terminal state of the episode
        if done:
            break
    print(f"Episode: {episode + 1}") # 玩几局游戏
    print(f"Epochs: {steps}") # 每局走多少步
    print(f"State: {prev_state}")
    print(f"Action: {action}")
    print(f"Reward: {reward}")
    print("Total Reward: ", r)
    # sleep(0.01) # 为了让显示变慢，否则画面会非常快
env.close()

ResetNeeded: Cannot call `env.render()` before calling `env.reset()`, if this is a intended action, set `disable_render_order_enforcing=True` on the OrderEnforcer wrapper.