# CartPole 強化學習實作(使用Stable Baselines3 套件)

In [1]:
# 匯入numpy套件，用於數值計算與陣列處理
import numpy as np

# 解決numpy版本更新後移除np.bool和np.bool8的問題
if not hasattr(np, 'bool'):   # 如果np模組中沒有bool屬性，則將其指向內建的bool類型
    np.bool = bool
if not hasattr(np, 'bool8'):  # 如果np模組中沒有bool8屬性，則將其指向np.bool
    np.bool8 = np.bool

# 匯入warnings模組，用於忽略不必要的警告訊息
import warnings

# 忽略所有的警告訊息，讓輸出結果更乾淨
warnings.filterwarnings("ignore")

# 匯入gymnasium套件，用於建立和操作強化學習環境
import gymnasium as gym

# 匯入A2C演算法，用於執行Actor-Critic強化學習訓練
from stable_baselines3 import A2C

# 匯入make_vec_env函式，用於建立多重環境以進行並行訓練
from stable_baselines3.common.env_util import make_vec_env

# 匯入SubprocVecEnv類別，用於在多個子進程中平行運行環境
from stable_baselines3.common.vec_env import SubprocVecEnv

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [2]:
# 建立一個包含4個並行實例的CartPole-v1環境
vec_env = make_vec_env("CartPole-v1", n_envs = 4)

# 建立A2C模型，使用多層感知器策略(MlpPolicy)作為策略網路
model = A2C("MlpPolicy", vec_env, verbose = 0)

# 開始訓練模型，總訓練步數為10000
model.learn(total_timesteps = 10000)

# 儲存訓練完成的模型
model.save("a2c_cartpole")

In [3]:
# 刪除模型以釋放記憶體資源
del model

# 測試用布林值列表，計算其中True的總數
test = [False, False, False, False]

# 輸出True的個數(True = 1，False = 0)
sum(test)

0

In [4]:
# 載入先前儲存的A2C模型
model = A2C.load("a2c_cartpole")

# 用於紀錄每回合的總報酬(每個回合是4個環境的獎勵總和)
all_rewards = []

# 初始化當前回合的總報酬(用numpy陣列，對應4個環境)
total_rewards = np.zeros(4)

# 重置並行環境並取得初始狀態
obs = vec_env.reset()

# 設定測試的回合數(預設為100)，每個回合會執行完整環境直到完成(直到done為True)
test_episodes = 100

# 設定每回合最大步數限制(預設為2000)，避免無限迴圈問題
max_steps_per_episode = 2000

# 記錄目前回合已執行的步數
step_count = 0

# 持續執行測試直到所有回合結束，確保測試完整
while test_episodes > 0:
    # 使用模型根據觀測值選擇動作(deterministic = True表示採用確定性策略)
    action, _state = model.predict(obs, deterministic = True)
    
    # 根據選定的動作與環境互動，取得新的觀測值、獎勵、完成狀態與額外資訊
    obs, reward, done, info = vec_env.step(action)
    
    # 累積本回合的獎勵
    total_rewards += reward
    
    # 計算目前回合的步數
    step_count += 1

    # 若所有環境回合結束(4個環境皆done)或達到最大步數限制，則輸出本回合的累積獎勵
    if sum(done) == 4 or step_count >= max_steps_per_episode:
        print("Episode", 101 - test_episodes, "reward:", total_rewards.sum())
        
        # 將本回合累積的總報酬加入紀錄清單中
        all_rewards.append(total_rewards.sum())
        
        # 初始化當前回合的總報酬(用numpy陣列，對應4個環境)
        total_rewards = np.zeros(4)

        # 重置步數計數器，開始新回合的步數計算
        step_count = 0

        # 將剩餘的測試回合數減1，避免無限迴圈
        test_episodes -= 1

# 關閉環境並釋放資源
vec_env.close()

Episode 1 reward: 8000.0
Episode 2 reward: 8000.0
Episode 3 reward: 8000.0
Episode 4 reward: 8000.0
Episode 5 reward: 8000.0
Episode 6 reward: 8000.0
Episode 7 reward: 8000.0
Episode 8 reward: 8000.0
Episode 9 reward: 8000.0
Episode 10 reward: 8000.0
Episode 11 reward: 8000.0
Episode 12 reward: 8000.0
Episode 13 reward: 8000.0
Episode 14 reward: 8000.0
Episode 15 reward: 8000.0
Episode 16 reward: 8000.0
Episode 17 reward: 8000.0
Episode 18 reward: 8000.0
Episode 19 reward: 8000.0
Episode 20 reward: 8000.0
Episode 21 reward: 8000.0
Episode 22 reward: 8000.0
Episode 23 reward: 8000.0
Episode 24 reward: 8000.0
Episode 25 reward: 8000.0
Episode 26 reward: 8000.0
Episode 27 reward: 8000.0
Episode 28 reward: 8000.0
Episode 29 reward: 8000.0
Episode 30 reward: 8000.0
Episode 31 reward: 8000.0
Episode 32 reward: 8000.0
Episode 33 reward: 8000.0
Episode 34 reward: 8000.0
Episode 35 reward: 8000.0
Episode 36 reward: 8000.0
Episode 37 reward: 8000.0
Episode 38 reward: 8000.0
Episode 39 reward: 80

In [5]:
# 程式進入點，確保子進程能正確啟動(避免Windows等系統問題)
if __name__ == "__main__":
    # 建立4個子進程平行運行的CartPole-v1環境
    env = make_vec_env("CartPole-v1", n_envs = 4, vec_env_cls = SubprocVecEnv)

    # 建立A2C模型，使用多層感知器策略，指定運算設備為CPU
    model = A2C("MlpPolicy", env, device="cpu")

    # 執行訓練(總共進行10000個時間步)
    model.learn(total_timesteps = 10000)